1use bon::builder;
2use crop::RopeSlice;
3use log::trace;
4
/// Capitalization flag attached to every word yielded by [`WordIterator`].
///
/// The flag for the first word comes from `WordIteratorOptions::initial_capitalize`; for
/// later words the parser derives it from the punctuation that ended the previous word
/// (`.`, `!`, `?`, and optionally `:`).
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum Capitalize {
    /// No capitalization trigger applies to the word.
    False,
    /// The word follows a capitalization trigger (or was flagged via the initial option).
    True,
}
10
/// Selects which punctuation, beyond the dashes that always do so, splits a word in two.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum BreakOnPunctuation {
    /// Only the en dash, em dash and horizontal bar break a word immediately.
    #[default]
    None,
    /// An ASCII hyphen (`-`) also breaks a word immediately.
    // Within this file the variant is only constructed by the unit tests, hence the
    // lint suppression.
    #[allow(dead_code)]
    Hyphen,
}
18
/// Selects which trailing punctuation flags the following word with `Capitalize::True`.
#[derive(Debug, Default, Eq, PartialEq)]
pub(crate) enum CapitalizeTriggerPunctuation {
    /// `.`, `!` and `?` trigger capitalization.
    #[default]
    Standard,
    /// The standard set plus `:`.
    PlusColon,
}
25
/// Iterator over the words of a rope slice.
///
/// Each item is `(byte offset, word slice, capitalize flag)`; see [`WordIteratorItem`].
#[derive(Debug)]
pub struct WordIterator<'rope> {
    /// Text being split into words.
    rope: RopeSlice<'rope>,
    /// Added to every yielded offset, so callers can receive offsets relative to a
    /// larger parent text.
    offset_from_parent: usize,
    /// State machine that performs the actual splitting.
    parser: WordParser,
}
32
33pub(crate) struct WordIteratorOptions {
34 pub(crate) initial_capitalize: Capitalize,
35 pub(crate) break_on_punctuation: BreakOnPunctuation,
36 pub(crate) capitalize_trigger_punctuation: CapitalizeTriggerPunctuation,
37}
38
39impl Default for WordIteratorOptions {
40 fn default() -> Self {
41 Self {
42 initial_capitalize: Capitalize::False,
43 break_on_punctuation: Default::default(),
44 capitalize_trigger_punctuation: Default::default(),
45 }
46 }
47}
48
49impl<'rope> WordIterator<'rope> {
50 pub fn new(
51 rope: RopeSlice<'rope>,
52 offset_from_parent: usize,
53 options: WordIteratorOptions,
54 ) -> Self {
55 Self {
56 rope,
57 offset_from_parent,
58 parser: WordParser::new(
59 options.initial_capitalize,
60 options.break_on_punctuation,
61 options.capitalize_trigger_punctuation,
62 ),
63 }
64 }
65
66 pub fn curr_index(&self) -> Option<usize> {
67 if let ParseState::Initial = self.parser.state {
68 assert!(self.parser.word_start_offset == self.parser.tracking_offset);
69 Some(self.parser.word_start_offset)
70 } else {
71 None
72 }
73 }
74
75 pub fn next_capitalize(&self) -> Option<Capitalize> {
76 if let ParseState::Initial = self.parser.state {
77 Some(self.parser.capitalize)
78 } else {
79 None
80 }
81 }
82
83 pub(crate) fn collect_remainder(self) -> Option<String> {
84 assert!(self.parser.word_start_offset == self.parser.tracking_offset);
85 if self.parser.word_start_offset == self.rope.byte_len() {
86 None
87 } else {
88 Some(
89 self.rope
90 .byte_slice(self.parser.word_start_offset..)
91 .to_string(),
92 )
93 }
94 }
95}
96
/// Item produced by word iteration: `(byte offset of the word, the word itself,
/// capitalization flag for the word)`.
pub(crate) type WordIteratorItem<'r> = (usize, RopeSlice<'r>, Capitalize);
98
99impl<'rope> Iterator for WordIterator<'rope> {
100 type Item = WordIteratorItem<'rope>;
101
102 fn next(&mut self) -> Option<Self::Item> {
103 let next_word_data = self.parser.parse(self.rope);
104
105 if let Some((offset, slice, capitalize)) = next_word_data {
106 Some((offset + self.offset_from_parent, slice, capitalize))
107 } else {
108 None
109 }
110 }
111}
112
/// Character-driven state machine that splits text into words.
#[derive(Debug)]
struct WordParser {
    /// Class of the most recently consumed character.
    state: ParseState,
    /// Byte offset at which the word currently being built starts.
    word_start_offset: usize,
    /// Byte offset just past the last consumed character.
    tracking_offset: usize,
    /// Capitalization flag that will be reported for the word currently being built.
    capitalize: Capitalize,
    /// Which extra punctuation splits a word immediately.
    break_on_punctuation: BreakOnPunctuation,
    /// Which trailing punctuation flags the following word for capitalization.
    capitalize_trigger_punctuation: CapitalizeTriggerPunctuation,
}
122
/// States of the `WordParser` state machine, named after the class of the most recently
/// consumed character.
#[derive(Debug, Default)]
enum ParseState {
    /// Nothing consumed yet for the current word; also the state after a word was emitted.
    #[default]
    Initial,
    /// Last character was an ASCII letter.
    AsciiAlphabetic,
    /// Last character was a non-ASCII alphabetic character.
    OtherAlphabetic,
    /// Last character was an ASCII digit.
    Numeric,
    /// Last character was whitespace preceding the word (skipped, not part of any word).
    Whitespace,
    /// Last character was a backslash; whatever comes next is kept as part of the word.
    Escape,
    /// Last character was the one following a backslash.
    PostEscape,
    /// Punctuation seen before the word's first character (skipped); holds the run so far.
    PunctuationLeading(String),
    /// Punctuation seen after word characters; holds the run so far. It stays in the word
    /// if more word characters follow, and is otherwise trimmed when the word ends (a lone
    /// `-` followed by whitespace is kept).
    PunctuationTrailing(String),
    /// Last character matched none of the other classes.
    Other,
}
137
/// Outcome of feeding one character to the parser.
#[derive(Debug, Clone)]
enum ParserNext {
    /// The character was consumed; keep feeding characters.
    Continue,
    /// A word is complete: `(start byte offset, end byte offset, capitalization flag)`.
    Break(usize, usize, Capitalize),
}
143
144impl WordParser {
145 fn new(
146 initial_capitalize: Capitalize,
147 break_on_punctuation: BreakOnPunctuation,
148 capitalize_trigger_punctuation: CapitalizeTriggerPunctuation,
149 ) -> Self {
150 Self {
151 state: ParseState::Initial,
152 word_start_offset: 0,
153 tracking_offset: 0,
154 capitalize: initial_capitalize,
155 break_on_punctuation,
156 capitalize_trigger_punctuation,
157 }
158 }
159
160 fn parse<'rope>(&mut self, rope: RopeSlice<'rope>) -> Option<WordIteratorItem<'rope>> {
161 assert!(self.word_start_offset == self.tracking_offset);
162 if self.word_start_offset >= rope.byte_len() {
163 return None;
164 }
165
166 let chars = rope.byte_slice(self.word_start_offset..).chars();
167 for c in chars {
168 trace!("Parser loop iteration:");
169 trace!(" state: {:?}", self.state);
170 trace!(" word_start_offset: {}", self.word_start_offset);
171 trace!(" tracking_offset: {}", self.tracking_offset);
172 trace!(
173 " word so far: {}",
174 rope.byte_slice(self.word_start_offset..self.tracking_offset)
175 );
176 trace!(" char: {c}");
177
178 let next = match c {
179 c if c.is_ascii_alphabetic() => self.consume_ascii_alphabetic(),
180 '0'..='9' => self.consume_numeric(),
181 _ if c.is_alphabetic() => self.consume_other_alphabetic(c),
182 _ if c.is_whitespace() => self.consume_whitespace(c),
183 '\\' => self.consume_escape(),
184 _ if is_punctuation(&c) => self.consume_punctuation(c),
185 _ => self.consume_other(c),
186 };
187
188 if let ParserNext::Break(start, end, capitalize) = next {
189 trace!("Break parser at word end with start: {start}, end: {end}");
190 self.word_start_offset = self.tracking_offset;
191 return Some((start, rope.byte_slice(start..end), capitalize));
192 }
193 }
194
195 let saved_start_offset = self.word_start_offset;
196 let word_end_offset = self.calculate_final_word_end_offset();
197
198 self.state = ParseState::Initial;
200 self.word_start_offset = self.tracking_offset;
201
202 if saved_start_offset == word_end_offset {
203 None
204 } else {
205 Some((
206 saved_start_offset,
207 rope.byte_slice(saved_start_offset..word_end_offset),
208 self.capitalize,
209 ))
210 }
211 }
212
213 fn consume_ascii_alphabetic(&mut self) -> ParserNext {
214 trace!("consume_ascii_alphabetic");
215 match &self.state {
216 ParseState::Escape => {
217 self.state = ParseState::PostEscape;
218 self.tracking_offset += 1;
219 ParserNext::Continue
220 }
221 _ => {
222 self.state = ParseState::AsciiAlphabetic;
223 self.tracking_offset += 1;
224 ParserNext::Continue
225 }
226 }
227 }
228
229 fn consume_other_alphabetic(&mut self, c: char) -> ParserNext {
230 trace!("consume_other_alphabetic: {c}");
231 match &self.state {
232 ParseState::Escape => {
233 self.state = ParseState::PostEscape;
234 self.tracking_offset += c.len_utf8();
235 ParserNext::Continue
236 }
237 _ => {
238 self.state = ParseState::OtherAlphabetic;
239 self.tracking_offset += c.len_utf8();
240 ParserNext::Continue
241 }
242 }
243 }
244
245 fn consume_numeric(&mut self) -> ParserNext {
246 trace!("consume_numeric");
247 match &self.state {
248 ParseState::Escape => {
249 self.state = ParseState::PostEscape;
250 self.tracking_offset += 1;
251 ParserNext::Continue
252 }
253 _ => {
254 self.state = ParseState::Numeric;
255 self.tracking_offset += 1;
256 ParserNext::Continue
257 }
258 }
259 }
260
261 fn consume_whitespace(&mut self, c: char) -> ParserNext {
262 trace!("consume_whitespace: {c}");
263 match &self.state {
264 ParseState::Initial | ParseState::PunctuationLeading(_) => {
265 self.state = ParseState::Whitespace;
266 self.word_start_offset += c.len_utf8();
267 self.tracking_offset += c.len_utf8();
268 ParserNext::Continue
269 }
270 ParseState::AsciiAlphabetic
271 | ParseState::OtherAlphabetic
272 | ParseState::Numeric
273 | ParseState::Other
274 | ParseState::PostEscape => {
275 let word_end_offset = self.tracking_offset;
276 let curr_capitalize = self.capitalize;
277
278 self.state = ParseState::Initial;
279 self.tracking_offset += c.len_utf8();
280 self.capitalize = Capitalize::False;
281
282 ParserNext::Break(self.word_start_offset, word_end_offset, curr_capitalize)
283 }
284 ParseState::Whitespace => {
285 self.word_start_offset += c.len_utf8();
286 self.tracking_offset += c.len_utf8();
287 ParserNext::Continue
288 }
289 ParseState::Escape => {
290 self.state = ParseState::PostEscape;
291 self.tracking_offset += c.len_utf8();
292 ParserNext::Continue
293 }
294 ParseState::PunctuationTrailing(punctuation) => {
295 let preserve_punctuation = punctuation == "-";
298
299 let word_end_offset = if preserve_punctuation {
300 self.tracking_offset
301 } else {
302 self.tracking_offset.saturating_sub(punctuation.len())
303 };
304 let curr_capitalize = self.capitalize;
305
306 if let Some(p) = punctuation.chars().last() {
307 self.capitalize = self.punc_triggers_capitalization(&p);
308 }
309 self.state = ParseState::Initial;
310 self.tracking_offset += c.len_utf8();
311
312 ParserNext::Break(self.word_start_offset, word_end_offset, curr_capitalize)
313 }
314 }
315 }
316
317 fn consume_punctuation(&mut self, c: char) -> ParserNext {
318 trace!("consume_punctuation: {c}");
319 match &self.state {
320 ParseState::Initial | ParseState::Whitespace => {
321 self.state = ParseState::PunctuationLeading(c.to_string());
322 self.word_start_offset += c.len_utf8();
323 self.tracking_offset += c.len_utf8();
324 ParserNext::Continue
325 }
326 ParseState::AsciiAlphabetic
327 | ParseState::OtherAlphabetic
328 | ParseState::Numeric
329 | ParseState::Other
330 | ParseState::PostEscape => {
331 if self.break_word_immediately_on_puncutation(&c) {
332 let word_end_offset = self.tracking_offset;
333 let curr_capitalize = self.capitalize;
334
335 self.capitalize = self.punc_triggers_capitalization(&c);
336 self.state = ParseState::Initial;
337 self.tracking_offset += c.len_utf8();
338
339 ParserNext::Break(self.word_start_offset, word_end_offset, curr_capitalize)
340 } else {
341 self.state = ParseState::PunctuationTrailing(c.to_string());
342 self.tracking_offset += c.len_utf8();
343 ParserNext::Continue
344 }
345 }
346 ParseState::Escape => {
347 self.state = ParseState::PostEscape;
348 self.tracking_offset += c.len_utf8();
349 ParserNext::Continue
350 }
351 ParseState::PunctuationLeading(punctuation) => {
352 self.state = ParseState::PunctuationLeading(format!("{punctuation}{c}"));
353 self.word_start_offset += c.len_utf8();
354 self.tracking_offset += c.len_utf8();
355 ParserNext::Continue
356 }
357 ParseState::PunctuationTrailing(punctuation) => {
358 if self.break_word_immediately_on_puncutation(&c) {
359 let word_end_offset = self.tracking_offset.saturating_sub(punctuation.len());
360 let curr_capitalize = self.capitalize;
361
362 self.capitalize = self.punc_triggers_capitalization(&c);
363 self.state = ParseState::Initial;
364 self.tracking_offset += c.len_utf8();
365
366 ParserNext::Break(self.word_start_offset, word_end_offset, curr_capitalize)
367 } else {
368 self.state = ParseState::PunctuationTrailing(format!("{punctuation}{c}"));
369 self.tracking_offset += c.len_utf8();
370 ParserNext::Continue
371 }
372 }
373 }
374 }
375
376 fn consume_escape(&mut self) -> ParserNext {
377 trace!("consume_escape");
378 match &self.state {
379 ParseState::Escape => {
380 self.state = ParseState::PostEscape;
381 self.tracking_offset += 1;
382 ParserNext::Continue
383 }
384 _ => {
385 self.state = ParseState::Escape;
386 self.tracking_offset += 1;
387 ParserNext::Continue
388 }
389 }
390 }
391
392 fn consume_other(&mut self, c: char) -> ParserNext {
393 trace!("consume_other: {c}");
394 match &self.state {
395 ParseState::Escape => {
396 self.state = ParseState::PostEscape;
397 self.tracking_offset += c.len_utf8();
398 ParserNext::Continue
399 }
400 _ => {
401 self.state = ParseState::Other;
402 self.tracking_offset += c.len_utf8();
403 ParserNext::Continue
404 }
405 }
406 }
407
408 fn calculate_final_word_end_offset(&self) -> usize {
409 match &self.state {
410 ParseState::PunctuationTrailing(punctuation) => {
411 self.tracking_offset.saturating_sub(punctuation.len())
412 }
413 _ => self.tracking_offset,
414 }
415 }
416
417 fn punc_triggers_capitalization_std(c: &char) -> bool {
418 *c == '!' || *c == '?' || *c == '.'
419 }
420
421 fn punc_triggers_capitalization(&self, c: &char) -> Capitalize {
422 if Self::punc_triggers_capitalization_std(c)
423 || *c == ':'
424 && matches!(
425 self.capitalize_trigger_punctuation,
426 CapitalizeTriggerPunctuation::PlusColon
427 )
428 {
429 Capitalize::True
430 } else {
431 Capitalize::False
432 }
433 }
434
435 fn break_on_hyphens(&self) -> bool {
436 matches!(self.break_on_punctuation, BreakOnPunctuation::Hyphen)
437 }
438
439 fn break_word_immediately_on_puncutation(&self, c: &char) -> bool {
440 match c {
441 '–' | '—' | '―' => true,
442 '-' => self.break_on_hyphens(),
443 _ => false,
444 }
445 }
446}
447
/// Returns whether `c` belongs to the fixed set of characters this module treats as
/// punctuation: dashes, brackets, quotes and the common sentence marks.
pub fn is_punctuation(c: &char) -> bool {
    matches!(
        *c,
        '!' | '?' | ',' | '.' | ';' | ':'
            | '-' | '–' | '—' | '―'
            | '(' | ')' | '[' | ']' | '{' | '}'
            | '\'' | '"' | '‘' | '’' | '“' | '”'
    )
}
472
/// Characters that can terminate a sentence.
const SENTENCE_ENDING_PUNCTUATION: &[char] = &['.', '!', '?', '…'];

/// Returns whether `c` is listed in [`SENTENCE_ENDING_PUNCTUATION`].
fn is_sentence_ending_punctuation(c: &char) -> bool {
    SENTENCE_ENDING_PUNCTUATION
        .iter()
        .any(|candidate| candidate == c)
}
478
479#[builder]
480pub(crate) fn is_sentence_start(
481 slice: RopeSlice<'_>,
482 query_offset: usize,
483 #[builder(default = true)] count_beginning_as_sentence_start: bool,
484) -> bool {
485 #[cfg(debug_assertions)]
486 log::trace!("Checking if offset {query_offset} is at sentence start");
487
488 let mut iter = WordIterator::new(slice, 0, Default::default())
489 .enumerate()
490 .peekable();
491
492 let (preceding_offset, preceding_word, queried_offset, queried_word) = loop {
493 match (iter.next(), iter.peek()) {
494 (Some((0, _)), _) if query_offset == 0 && count_beginning_as_sentence_start => {
495 return count_beginning_as_sentence_start;
496 }
497 (
498 Some((_, (preceding_offset, preceding_word, _))),
499 Some((_, (next_word_offset, next_word, _))),
500 ) => {
501 if *next_word_offset == query_offset {
502 break (
503 preceding_offset,
504 preceding_word,
505 next_word_offset,
506 next_word,
507 );
508 }
509 }
510 _ => {
511 return false;
512 }
513 }
514 };
515
516 if !(queried_word.is_char_boundary(0)
528 && queried_word
529 .chars()
530 .next()
531 .is_some_and(|c: char| c.is_uppercase()))
532 {
533 return false;
534 }
535
536 let between = slice
537 .byte_slice(preceding_offset + preceding_word.byte_len()..*queried_offset)
538 .chars();
539 #[cfg(debug_assertions)]
540 trace!(
541 "Parsing the between-sentence text: \"{}\"",
542 between.clone().collect::<String>()
543 );
544 between_sentence_parser::BetweenSentenceParser::new().parse(between)
545}
546
547mod between_sentence_parser {
548 #[cfg(debug_assertions)]
549 use log::trace;
550
551 #[derive(Debug)]
552 enum BetweenSentenceParserState {
553 Initial,
554 PrecedingPunctuation,
555 SentenceEndingPunctuation(EndingPunctuationType),
556 FollowingPunctuation,
557 Whitespace,
558 SentenceStartPunctuation,
559 }
560
561 #[derive(Debug)]
562 enum EndingPunctuationType {
563 Mixable,
564 NonMixable(char),
565 }
566
567 #[derive(Debug)]
568 pub(super) struct BetweenSentenceParser {
569 state: BetweenSentenceParserState,
570 }
571
572 impl BetweenSentenceParser {
573 pub(super) fn new() -> Self {
574 Self {
575 state: BetweenSentenceParserState::Initial,
576 }
577 }
578
579 pub(super) fn parse(&mut self, chars: impl Iterator<Item = char>) -> bool {
580 use BetweenSentenceParserState::*;
581
582 for char in chars {
583 #[cfg(debug_assertions)]
584 trace!("Parser state: {:?}", self.state);
585
586 match char {
587 c if c.is_whitespace() => match self.state {
588 SentenceEndingPunctuation(_) | FollowingPunctuation => {
589 self.state = Whitespace;
590 }
591 Whitespace => {}
592 _ => return false,
593 },
594 c if super::is_sentence_ending_punctuation(&c) => {
595 let r#type = match c {
596 '.' => EndingPunctuationType::NonMixable(c),
597 _ => EndingPunctuationType::Mixable,
598 };
599 match self.state {
600 Initial | PrecedingPunctuation => {
601 self.state = SentenceEndingPunctuation(r#type);
602 }
603 SentenceEndingPunctuation(EndingPunctuationType::Mixable)
604 if matches!(r#type, EndingPunctuationType::Mixable) => {}
605 SentenceEndingPunctuation(EndingPunctuationType::NonMixable(old_c))
606 if matches!(r#type, EndingPunctuationType::NonMixable(c) if c == old_c) =>
607 {}
608 _ => return false,
609 }
610 }
611 c if super::is_punctuation(&c) => match self.state {
612 Initial => {
613 self.state = PrecedingPunctuation;
614 }
615 PrecedingPunctuation | FollowingPunctuation | SentenceStartPunctuation => {}
616 SentenceEndingPunctuation(_) => {
617 self.state = FollowingPunctuation;
618 }
619 Whitespace => self.state = SentenceStartPunctuation,
620 },
621 _ => return false,
622 }
623 }
624
625 matches!(self.state, Whitespace | SentenceStartPunctuation)
626 }
627 }
628}
629
630pub(crate) mod extras {
631 use std::collections::VecDeque;
632
633 use super::*;
634
635 pub(crate) struct WordIteratorExtension<'a, I> {
636 prefix: Option<I>,
637 inner: WordIterator<'a>,
638 }
639
640 impl<'a, I> From<WordIterator<'a>> for WordIteratorExtension<'a, I> {
641 fn from(inner: WordIterator<'a>) -> Self {
642 Self {
643 prefix: None,
644 inner,
645 }
646 }
647 }
648
649 impl<'a, I> WordIteratorExtension<'a, I>
650 where
651 I: Iterator<Item = WordIteratorItem<'a>>,
652 {
653 pub(crate) fn extend_on_prefix(self, prefix: I) -> Self {
654 Self {
655 prefix: Some(prefix),
656 inner: self.into_inner().1,
657 }
658 }
659
660 pub(crate) fn into_inner(self) -> (Option<I>, WordIterator<'a>) {
661 (self.prefix, self.inner)
662 }
663 }
664
665 impl<'a, I> Iterator for WordIteratorExtension<'a, I>
666 where
667 I: Iterator<Item = WordIteratorItem<'a>>,
668 {
669 type Item = WordIteratorItem<'a>;
670
671 fn next(&mut self) -> Option<Self::Item> {
672 match self.prefix {
673 Some(ref mut prefix) => prefix.next().or_else(|| self.inner.next()),
674 None => self.inner.next(),
675 }
676 }
677 }
678
679 #[cfg(test)]
680 pub(crate) struct UnitIterator<'a> {
681 _marker: std::marker::PhantomData<&'a ()>,
682 }
683
684 #[cfg(test)]
685 impl<'a> Iterator for UnitIterator<'a> {
686 type Item = WordIteratorItem<'a>;
687
688 fn next(&mut self) -> Option<Self::Item> {
689 None
690 }
691 }
692
693 pub(crate) struct WordIteratorPrefix<'a> {
694 inner: VecDeque<WordIteratorItem<'a>>,
695 }
696
697 impl<'a> WordIteratorPrefix<'a> {
698 pub(crate) fn new<I>(inner: I) -> Self
699 where
700 I: IntoIterator<Item = WordIteratorItem<'a>>,
701 {
702 Self {
703 inner: inner.into_iter().collect(),
704 }
705 }
706 }
707
708 impl<'a> Iterator for WordIteratorPrefix<'a> {
709 type Item = WordIteratorItem<'a>;
710
711 fn next(&mut self) -> Option<Self::Item> {
712 self.inner.pop_front()
713 }
714 }
715}
716
#[cfg(test)]
mod tests {
    use super::*;
    use crop::Rope;

    /// Owned copy of one iterator item: `(byte offset, word text, capitalize flag)`.
    type Word = (usize, String, Capitalize);

    /// Converts a borrowed iterator item into an owned [`Word`].
    fn to_owned_word((offset, word, capitalize): WordIteratorItem<'_>) -> Word {
        (offset, word.to_string(), capitalize)
    }

    /// Collects every word a `WordIterator` configured with `options` yields for `text`.
    fn words_with(text: &str, options: WordIteratorOptions) -> Vec<Word> {
        let rope = Rope::from(text);
        WordIterator::new(rope.byte_slice(..), 0, options)
            .map(to_owned_word)
            .collect()
    }

    /// Collects every word a default-configured `WordIterator` yields for `text`.
    fn words(text: &str) -> Vec<Word> {
        words_with(text, WordIteratorOptions::default())
    }

    /// Expected word that is not flagged for capitalization.
    fn plain(offset: usize, text: &str) -> Word {
        (offset, text.to_string(), Capitalize::False)
    }

    /// Expected word that is flagged for capitalization.
    fn capitalized(offset: usize, text: &str) -> Word {
        (offset, text.to_string(), Capitalize::True)
    }

    /// Projects words onto `(offset, text)`, ignoring the capitalization flag.
    fn positions(words: &[Word]) -> Vec<(usize, &str)> {
        words
            .iter()
            .map(|(offset, text, _)| (*offset, text.as_str()))
            .collect()
    }

    /// Projects words onto `(text, capitalize)`, ignoring the offset.
    fn capitalization(words: &[Word]) -> Vec<(&str, Capitalize)> {
        words
            .iter()
            .map(|(_, text, capitalize)| (text.as_str(), *capitalize))
            .collect()
    }

    /// Runs `is_sentence_start` with default options on `text`.
    fn sentence_starts_at(text: &str, query_offset: usize) -> bool {
        let rope = Rope::from(text);
        is_sentence_start()
            .slice(rope.byte_slice(..))
            .query_offset(query_offset)
            .call()
    }

    #[test]
    fn test_word_iterator_basic() {
        assert_eq!(
            words("hello world"),
            vec![plain(0, "hello"), plain(6, "world")]
        );
    }

    #[test]
    fn test_word_iterator_with_punctuation() {
        assert_eq!(
            words("hello, world!"),
            vec![plain(0, "hello"), plain(7, "world")]
        );
    }

    #[test]
    fn test_word_iterator_with_multiple_spaces() {
        // Three spaces between the words, so the second word starts at byte 8.
        assert_eq!(
            words("hello   world"),
            vec![plain(0, "hello"), plain(8, "world")]
        );
    }

    #[test]
    fn test_word_iterator_with_numbers() {
        assert_eq!(
            words("test123 456"),
            vec![plain(0, "test123"), plain(8, "456")]
        );
    }

    #[test]
    fn test_word_iterator_with_quotes() {
        assert_eq!(
            words("hello \"world\""),
            vec![plain(0, "hello"), plain(7, "world")]
        );
    }

    #[test]
    fn test_word_iterator_include_hyphen_on_bare_prefixes() {
        let words = words("pre- and post-world");
        let positions = positions(&words);

        assert_eq!(positions[0], (0, "pre-"));
        assert_eq!(positions[2], (9, "post-world"));
    }

    #[test]
    fn test_word_iterator_with_emoji() {
        assert_eq!(
            words("hello 🤝 world"),
            vec![plain(0, "hello"), plain(6, "🤝"), plain(11, "world")]
        );
    }

    #[test]
    fn test_word_iterator_with_cjk() {
        assert_eq!(
            words("hello 你好 world"),
            vec![plain(0, "hello"), plain(6, "你好"), plain(13, "world")]
        );
    }

    #[test]
    fn test_word_iterator_initial_capitalization() {
        let words = words_with(
            "hello world",
            WordIteratorOptions {
                initial_capitalize: Capitalize::True,
                ..Default::default()
            },
        );

        assert_eq!(words, vec![capitalized(0, "hello"), plain(6, "world")]);
    }

    #[test]
    fn test_word_iterator_subsequent_capitalization() {
        let words = words("some thing. Sentence. World.");

        // Everything from the third word on: exactly the two sentence starts.
        assert_eq!(
            words[2..],
            [capitalized(12, "Sentence"), capitalized(22, "World")]
        );
    }

    #[test]
    fn test_word_iterator_break_on_hyphens() {
        let unbroken = words("hello-world");
        assert_eq!(unbroken.len(), 1);
        assert_eq!(unbroken[0].1, "hello-world");

        let broken = words_with(
            "hello-world",
            WordIteratorOptions {
                break_on_punctuation: BreakOnPunctuation::Hyphen,
                ..Default::default()
            },
        );
        assert_eq!(positions(&broken), vec![(0, "hello"), (6, "world")]);
    }

    #[test]
    fn test_word_iterator_capitalize_on_colons() {
        let standard = words("hello: world");
        assert_eq!(
            capitalization(&standard),
            vec![("hello", Capitalize::False), ("world", Capitalize::False)]
        );

        let with_colon = words_with(
            "hello: world",
            WordIteratorOptions {
                capitalize_trigger_punctuation: CapitalizeTriggerPunctuation::PlusColon,
                ..Default::default()
            },
        );
        assert_eq!(
            capitalization(&with_colon),
            vec![("hello", Capitalize::False), ("world", Capitalize::True)]
        );
    }

    #[test]
    fn test_word_iterator_complex_sentence() {
        let words = words(
            "Each of these open source tools are amazing, but they all had a major drawback - we couldn't use Postgres as the server's datastore. If you haven't noticed yet, our team likes Postgres a lot 😉."
        );

        let offsets: Vec<usize> = words.iter().map(|(offset, _, _)| *offset).collect();
        assert_eq!(
            offsets,
            vec![
                0, 5, 8, 14, 19, 26, 32, 36, 45, 49, 54, 58, 62, 64, 70, 81, 84, 93, 97, 106, 109,
                113, 122, 133, 136, 140, 148, 156, 161, 165, 170, 176, 185, 187, 191
            ]
        );

        let texts: Vec<&str> = words.iter().map(|(_, text, _)| text.as_str()).collect();
        assert_eq!(
            texts,
            vec![
                "Each",
                "of",
                "these",
                "open",
                "source",
                "tools",
                "are",
                "amazing",
                "but",
                "they",
                "all",
                "had",
                "a",
                "major",
                "drawback",
                "we",
                "couldn't",
                "use",
                "Postgres",
                "as",
                "the",
                "server's",
                "datastore",
                "If",
                "you",
                "haven't",
                "noticed",
                "yet",
                "our",
                "team",
                "likes",
                "Postgres",
                "a",
                "lot",
                "😉"
            ]
        );

        // Only "If" (index 23) follows a sentence-ending period.
        for (idx, (_, _, cap)) in words.iter().enumerate() {
            let expected = if idx == 23 {
                Capitalize::True
            } else {
                Capitalize::False
            };
            assert_eq!(*cap, expected);
        }
    }

    #[test]
    fn test_word_iterator_collect_remainder() {
        let rope = Rope::from("hello everybody in the world");
        let mut iter = WordIterator::new(rope.byte_slice(..), 0, Default::default());

        iter.next();
        assert_eq!(
            iter.collect_remainder(),
            Some("everybody in the world".to_string())
        );
    }

    #[test]
    fn test_word_iterator_no_remainder() {
        let rope = Rope::from("hello");
        let mut iter = WordIterator::new(rope.byte_slice(..), 0, Default::default());

        iter.next();
        assert!(iter.collect_remainder().is_none());
    }

    #[test]
    fn test_word_iterator_wrapper() {
        let rope = Rope::from("hello world");
        let iter: extras::WordIteratorExtension<'_, extras::UnitIterator> =
            WordIterator::new(rope.byte_slice(..), 0, Default::default()).into();

        let collected: Vec<Word> = iter.map(to_owned_word).collect();
        assert_eq!(collected, vec![plain(0, "hello"), plain(6, "world")]);
    }

    #[test]
    fn test_word_iterator_wrapper_with_prefix() {
        let rope = Rope::from("hello world keep going");

        let mut orig_iter: extras::WordIteratorExtension<'_, extras::WordIteratorPrefix> =
            WordIterator::new(rope.byte_slice(..), 0, Default::default()).into();

        let consumed = vec![orig_iter.next().unwrap(), orig_iter.next().unwrap()];

        // Re-attaching the consumed words must replay them before the remaining ones.
        let new_iter = orig_iter.extend_on_prefix(extras::WordIteratorPrefix::new(consumed));
        let replayed: Vec<Word> = new_iter.map(to_owned_word).collect();

        assert_eq!(
            positions(&replayed),
            vec![(0, "hello"), (6, "world"), (12, "keep"), (17, "going")]
        );
    }

    #[test]
    fn test_is_sentence_start() {
        let text = "Hello world! What a wonderful day. What's up?";

        for offset in [0, 13, 35] {
            assert!(sentence_starts_at(text, offset));
        }
        for offset in [6, 11, 12, 40] {
            assert!(!sentence_starts_at(text, offset));
        }
    }

    #[test]
    fn test_is_sentence_start_handles_ellipsis() {
        assert!(!sentence_starts_at("Hello... world!", 9));
        assert!(sentence_starts_at("Hello... World!", 9));
    }

    #[test]
    fn test_is_sentence_start_handles_mixed_punctuation() {
        assert!(sentence_starts_at("Hello?!?!?! World!", 12));
        assert!(!sentence_starts_at("Hello.!?. What?", 10));
    }

    #[test]
    fn test_is_sentence_start_gracefully_fails_on_empty_rope() {
        assert!(!sentence_starts_at("", 0));
    }

    #[test]
    fn test_is_sentence_start_gracefully_fails_on_out_of_bounds() {
        assert!(!sentence_starts_at("Hello", 1000));
    }
}