* Get word boundaries using Intl.Segmenter for proper Unicode word segmentation. This correctly handles CJK (Chinese, Japanese, Korean) text, where each character is typically its own word, as well as scripts that use spaces between words.
()
| 1146 | * is typically its own word, as well as scripts that use spaces between words. |
| 1147 | */ |
public getWordBoundaries(): Array<{
  start: number
  end: number
  isWordLike: boolean
}> {
  // Fast path: boundaries were already computed, so reuse the cached list.
  if (this.wordBoundariesCache) {
    return this.wordBoundariesCache
  }

  // Build into a local array and publish it to the cache only after
  // segmentation has completed. If getWordSegmenter() or the iteration
  // throws, the cache stays unset and the next call retries, rather than
  // silently returning an empty or partially filled list forever.
  const boundaries: Array<{
    start: number
    end: number
    isWordLike: boolean
  }> = []
  for (const { index, segment, isWordLike } of getWordSegmenter().segment(
    this.text,
  )) {
    boundaries.push({
      start: index,
      // `end` is exclusive: the segment's start offset plus its string length.
      end: index + segment.length,
      // `isWordLike` may be absent on a segment; treat a missing value as non-word.
      isWordLike: isWordLike ?? false,
    })
  }

  this.wordBoundariesCache = boundaries
  return boundaries
}
| 1165 | |
| 1166 | /** |
| 1167 | * Binary search for boundaries. |