* Get word boundaries using Intl.Segmenter for proper Unicode word segmentation. * This correctly handles CJK (Chinese, Japanese, Korean) text where each character * is typically its own word, as well as scripts that use spaces between words.
()
| 1171 | * is typically its own word, as well as scripts that use spaces between words. |
| 1172 | */ |
public getWordBoundaries(): Array<{
  start: number
  end: number
  isWordLike: boolean
}> {
  // Fast path: the boundaries were already computed by an earlier call.
  if (this.wordBoundariesCache) {
    return this.wordBoundariesCache
  }

  // Store the (initially empty) list on the instance first, then fill it
  // through the local alias — one entry per segment, in segmenter order.
  const boundaries: Array<{
    start: number
    end: number
    isWordLike: boolean
  }> = []
  this.wordBoundariesCache = boundaries

  for (const piece of getWordSegmenter().segment(this.text)) {
    const start = piece.index
    // `end` is exclusive: start offset plus the segment's string length.
    const end = start + piece.segment.length
    // `isWordLike` may be undefined on a segment; treat that as "not a word".
    const isWordLike = piece.isWordLike ?? false
    boundaries.push({ start, end, isWordLike })
  }

  return boundaries
}
| 1190 | |
| 1191 | /** |
| 1192 | * Binary search for boundaries. |