MCPcopy Index your code
hub / github.com/formatjs/formatjs / isSegmentWordLike

Function isSegmentWordLike

packages/intl-segmenter/segmenter.ts:311–369  ·  view source on GitHub ↗

Determines if a segment is word-like according to Unicode Word Break rules. A segment is considered word-like if it contains alphabetic characters, numbers, or ideographs. Segments containing only whitespace, punctuation, or symbols are not word-like. Per Unicode Word Break (UAX #29) …

(segment: string, matchingRule: string)

Source from the content-addressed store, hash-verified

309 * @returns true if the segment is word-like
310 */
311function isSegmentWordLike(segment: string, matchingRule: string): boolean {
312 // Primary check: Does the segment contain word characters?
313 // Word-like segments contain letters (including ideographs), numbers,
314 // or connecting characters like apostrophes within words
315 //
316 // Regex matches:
317 // - Letters: \p{L} (all Unicode letters)
318 // - Numbers: \p{N} (all Unicode numbers)
319 // - Marks: \p{M} (combining marks, typically part of letters)
320 //
321 // Note: Using Unicode property escapes which work in modern JS engines
322 // and are necessary for proper internationalization
323
324 // Lazy-initialize Unicode regex on first use
325 if (WORD_CHARACTERS_UNICODE_REGEX === undefined) {
326 try {
327 // Create Unicode property escape regex at runtime to avoid compile-time TS1501 error
328 WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u')
329 } catch {
330 // Environment doesn't support Unicode property escapes
331 WORD_CHARACTERS_UNICODE_REGEX = null
332 }
333 }
334
335 let hasWordCharacters: boolean
336 if (WORD_CHARACTERS_UNICODE_REGEX) {
337 // Check if segment contains word characters using Unicode property escapes
338 // This matches the behavior of native Intl.Segmenter in Chrome/Firefox
339 hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment)
340 } else {
341 // Fallback for environments without Unicode property escapes
342 // Match basic word characters: letters, numbers, underscores
343 hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment)
344 }
345
346 // If segment contains word characters, it's word-like
347 if (hasWordCharacters) {
348 return true
349 }
350
351 // If no word characters, check if it's definitely not word-like via rules
352 // Non-word-like rules per Unicode Word Break specification (UAX #29):
353 // https://unicode.org/reports/tr29/#Word_Boundaries
354 //
355 // WB3a (3.1): Break after newlines ((Newline | CR | LF) ÷)
356 // WB3b (3.2): Break before newlines (÷ (Newline | CR | LF))
357 // WB3d (3.4): Keep horizontal whitespace together (WSegSpace × WSegSpace)
358 //
359 // These rules specifically identify non-word segments like line breaks and whitespace
360 const definitelyNotWordLikeRules = ['3.1', '3.2', '3.4']
361
362 if (definitelyNotWordLikeRules.includes(matchingRule)) {
363 return false
364 }
365
366 // For segments without word characters and not matching specific non-word rules,
367 // return false (e.g., punctuation, symbols, whitespace via rule 999)
368 return false

Callers 1

createSegmentDataObjectFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected