Determines if a segment is word-like according to Unicode Word Break rules. A segment is considered word-like if it contains alphabetic characters, numbers, or ideographs. Segments containing only whitespace, punctuation, or symbols are not word-like. Per Unicode Word Break (UAX #29, https://unicode.org/reports/tr29/#Word_Boundaries).
(segment: string, matchingRule: string)
| 309 | * @returns true if the segment is word-like |
| 310 | */ |
| 311 | function isSegmentWordLike(segment: string, matchingRule: string): boolean { |
| 312 | // Primary check: Does the segment contain word characters? |
| 313 | // Word-like segments contain letters (including ideographs), numbers, |
| 314 | // or combining marks (apostrophes and other punctuation are not matched on their own) |
| 315 | // |
| 316 | // Regex matches: |
| 317 | // - Letters: \p{L} (all Unicode letters) |
| 318 | // - Numbers: \p{N} (all Unicode numbers) |
| 319 | // - Marks: \p{M} (combining marks, typically part of letters) |
| 320 | // |
| 321 | // Note: Using Unicode property escapes which work in modern JS engines |
| 322 | // and are necessary for proper internationalization |
| 323 | |
| 324 | // Lazy-initialize Unicode regex on first use |
| 325 | if (WORD_CHARACTERS_UNICODE_REGEX === undefined) { |
| 326 | try { |
| 327 | // Create Unicode property escape regex at runtime to avoid compile-time TS1501 error |
| 328 | WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u') |
| 329 | } catch { |
| 330 | // Environment doesn't support Unicode property escapes |
| 331 | WORD_CHARACTERS_UNICODE_REGEX = null |
| 332 | } |
| 333 | } |
| 334 | |
| 335 | let hasWordCharacters: boolean |
| 336 | if (WORD_CHARACTERS_UNICODE_REGEX) { |
| 337 | // Check if segment contains word characters using Unicode property escapes |
| 338 | // This matches the behavior of native Intl.Segmenter in Chrome/Firefox |
| 339 | hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment) |
| 340 | } else { |
| 341 | // Fallback for environments without Unicode property escapes |
| 342 | // Match basic word characters: letters, numbers, underscores |
| 343 | hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment) |
| 344 | } |
| 345 | |
| 346 | // If segment contains word characters, it's word-like |
| 347 | if (hasWordCharacters) { |
| 348 | return true |
| 349 | } |
| 350 | |
| 351 | // If no word characters, check if it's definitely not word-like via rules |
| 352 | // Non-word-like rules per Unicode Word Break specification (UAX #29): |
| 353 | // https://unicode.org/reports/tr29/#Word_Boundaries |
| 354 | // |
| 355 | // WB3a (3.1): Break after newlines ((Newline | CR | LF) ÷) |
| 356 | // WB3b (3.2): Break before newlines (÷ (Newline | CR | LF)) |
| 357 | // WB3d (3.4): Keep horizontal whitespace together (WSegSpace × WSegSpace) |
| 358 | // |
| 359 | // These rules specifically identify non-word segments like line breaks and whitespace |
| 360 | const definitelyNotWordLikeRules = ['3.1', '3.2', '3.4'] |
| 361 | |
| 362 | if (definitelyNotWordLikeRules.includes(matchingRule)) { |
| 363 | return false |
| 364 | } |
| 365 | |
| 366 | // For segments without word characters and not matching specific non-word rules, |
| 367 | // return false (e.g., punctuation, symbols, whitespace via rule 999) |
| 368 | return false |
no outgoing calls
no test coverage detected