(html: string)
| 53 | * spec examples depend on. |
| 54 | */ |
| 55 | export function normalizeHtml(html: string): string { |
| 56 | let out = html; |
| 57 | |
| 58 | // Normalise self-closing void elements to `<br />` style consistently. |
| 59 | const voidTags = ['br', 'hr', 'img', 'wbr', 'input']; |
| 60 | for (const tag of voidTags) { |
| 61 | const re = new RegExp(`<${tag}((?:\\s+[^>]*?)?)\\s*/?\\s*>`, 'gi'); |
| 62 | out = out.replace(re, (_m, attrs: string) => { |
| 63 | const trimmed = attrs.trim(); |
| 64 | return trimmed ? `<${tag} ${trimmed} />` : `<${tag} />`; |
| 65 | }); |
| 66 | } |
| 67 | |
| 68 | // Sort attributes inside tags alphabetically. Skip closing tags and |
| 69 | // self-closed void tags we already handled. |
| 70 | // |
| 71 | // The attrs portion is anchored to start with a valid attr-name char |
| 72 | // (`[a-z_:]` after the `i` flag). That excludes the `/` of `<br />`, |
| 73 | // which would otherwise be captured as attrs and dropped by the |
| 74 | // attr-name tokenizer, undoing the void-tag normalisation above. The |
| 75 | // `[ \t\n\r]+` separator (instead of `\s+`) plus this anchor also |
| 76 | // prevents polynomial backtracking against the trailing `[^>]*?`. |
| 77 | out = out.replace( |
| 78 | /<([a-z][\w-]*)[ \t\n\r]+([a-z_:][^>]*?)(\/?)>/gi, |
| 79 | (_m, name: string, attrs: string, selfClose: string) => { |
| 80 | // Parse attrs of the form: name | name="value" | name='value' |
| 81 | const parsed: Array<[string, string]> = []; |
| 82 | const re = /([a-z_:][\w:.-]*)(?:\s*=\s*("[^"]*"|'[^']*'|[^\s"'>`]+))?/gi; |
| 83 | let m: RegExpExecArray | null; |
| 84 | // eslint-disable-next-line no-cond-assign |
| 85 | while ((m = re.exec(attrs))) { |
| 86 | parsed.push([m[1], m[2] ?? '']); |
| 87 | } |
| 88 | parsed.sort(([a], [b]) => (a < b ? -1 : a > b ? 1 : 0)); |
| 89 | const rebuilt = parsed |
| 90 | .map(([k, v]) => (v === '' ? k : `${k}=${v}`)) |
| 91 | .join(' '); |
| 92 | const sc = selfClose ? ' /' : ''; |
| 93 | return rebuilt |
| 94 | ? `<${name} ${rebuilt}${sc}>` |
| 95 | : `<${name}${sc}>`; |
| 96 | }, |
| 97 | ); |
| 98 | |
| 99 | // Collapse whitespace between adjacent tag boundaries. Crucially: |
| 100 | // `>foo\n<` does NOT match because `foo` isn't whitespace — content is |
| 101 | // preserved. Only whitespace-only `>WS<` collapses. |
| 102 | out = out.replace(/>[ \t\n\r]+</g, '><'); |
| 103 | |
| 104 | // Strip whitespace immediately following a void self-closing tag. This |
| 105 | // covers cmark's `<br>\nfoo` vs marked's `<br>foo`: after the void-tag |
| 106 | // normalisation above both inputs reach `<br />`, then we strip the |
| 107 | // following whitespace run. |
| 108 | out = out.replace( |
| 109 | /(<(?:br|hr|wbr|img|input)\s+\/>)[ \t\n\r]+/gi, |
| 110 | '$1', |
| 111 | ); |
| 112 |
no test coverage detected