hub / github.com/formatjs/formatjs / isSegmentWordLike

Function isSegmentWordLike

packages/intl-segmenter/segmenter.ts:311–369 · view source on GitHub ↗

Determines if a segment is word-like according to Unicode Word Break rules. A segment is considered word-like if it contains alphabetic characters, numbers, or ideographs. Segments containing only whitespace, punctuation, or symbols are not word-like. Per Unicode Word Break (UAX #29) …

(segment: string, matchingRule: string)

Source from the content-addressed store, hash-verified

309	* @returns true if the segment is word-like
310	*/
311	function isSegmentWordLike(segment: string, matchingRule: string): boolean {
312	// Primary check: Does the segment contain word characters?
313	// Word-like segments contain letters (including ideographs), numbers,
314	// or connecting characters like apostrophes within words
315	//
316	// Regex matches:
317	// - Letters: \p{L} (all Unicode letters)
318	// - Numbers: \p{N} (all Unicode numbers)
319	// - Marks: \p{M} (combining marks, typically part of letters)
320	//
321	// Note: Using Unicode property escapes which work in modern JS engines
322	// and are necessary for proper internationalization
323
324	// Lazy-initialize Unicode regex on first use
325	if (WORD_CHARACTERS_UNICODE_REGEX === undefined) {
326	try {
327	// Create Unicode property escape regex at runtime to avoid compile-time TS1501 error
328	WORD_CHARACTERS_UNICODE_REGEX = new RegExp('[\\p{L}\\p{N}\\p{M}]', 'u')
329	} catch {
330	// Environment doesn't support Unicode property escapes
331	WORD_CHARACTERS_UNICODE_REGEX = null
332	}
333	}
334
335	let hasWordCharacters: boolean
336	if (WORD_CHARACTERS_UNICODE_REGEX) {
337	// Check if segment contains word characters using Unicode property escapes
338	// This matches the behavior of native Intl.Segmenter in Chrome/Firefox
339	hasWordCharacters = WORD_CHARACTERS_UNICODE_REGEX.test(segment)
340	} else {
341	// Fallback for environments without Unicode property escapes
342	// Match basic word characters: letters, numbers, underscores
343	hasWordCharacters = WORD_CHARACTERS_BASIC_REGEX.test(segment)
344	}
345
346	// If segment contains word characters, it's word-like
347	if (hasWordCharacters) {
348	return true
349	}
350
351	// If no word characters, check if it's definitely not word-like via rules
352	// Non-word-like rules per Unicode Word Break specification (UAX #29):
353	// https://unicode.org/reports/tr29/#Word_Boundaries
354	//
355	// WB3a (3.1): Break after newlines ((Newline | CR | LF) ÷)
356	// WB3b (3.2): Break before newlines (÷ (Newline | CR | LF))
357	// WB3d (3.4): Keep horizontal whitespace together (WSegSpace × WSegSpace)
358	//
359	// These rules specifically identify non-word segments like line breaks and whitespace
360	const definitelyNotWordLikeRules = ['3.1', '3.2', '3.4']
361
362	if (definitelyNotWordLikeRules.includes(matchingRule)) {
363	return false
364	}
365
366	// For segments without word characters and not matching specific non-word rules,
367	// return false (e.g., punctuation, symbols, whitespace via rule 999)
368	return false

Callers 1

createSegmentDataObjectFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected