* Given a SAX-like `HtmlSaxHandler`, parses `htmlText` and lets the `handler` know the structure while visiting the nodes. If the provided handler is an implementation of `htmlparser.HtmlSaxHandlerWithLocation`, then its `setDocLocator` method will get called prior to `star…
(handler, htmlText)
| 512 | * @param {string} htmlText The html text. |
| 513 | */ |
| 514 | parse(handler, htmlText) { |
| 515 | let htmlUpper = null; |
| 516 | let inTag = false; // True iff we're currently processing a tag. |
| 517 | const attribs = []; // Accumulates attribute names and values. |
| 518 | let tagName; // The name of the tag currently being processed. |
| 519 | let eflags; // The element flags for the current tag. |
| 520 | let openTag; // True if the current tag is an open tag. |
| 521 | const tagStack = new TagNameStack(handler); |
| 522 | |
| 523 | // Only provide location information if the handler implements the |
| 524 | // setDocLocator method. |
| 525 | let locator = null; |
| 526 | if (handler instanceof parserInterface.HtmlSaxHandlerWithLocation) { |
| 527 | locator = new DocLocatorImpl(htmlText); |
| 528 | handler.setDocLocator(locator); |
| 529 | } |
| 530 | |
| 531 | // Lets the handler know that we are starting to parse the document. |
| 532 | handler.startDoc(); |
| 533 | |
| 534 | // Consumes tokens from the htmlText and stops once all tokens are |
| 535 | // processed. |
| 536 | while (htmlText) { |
| 537 | const regex = inTag ? INSIDE_TAG_TOKEN_ : OUTSIDE_TAG_TOKEN_; |
| 538 | // Gets the next token |
| 539 | const m = htmlText.match(regex); |
| 540 | if (locator) { |
| 541 | locator.advancePos(m[0]); |
| 542 | } |
| 543 | // And removes it from the string |
| 544 | htmlText = htmlText.substring(m[0].length); |
| 545 | |
| 546 | // TODO(goto): cleanup this code breaking it into separate methods. |
| 547 | if (inTag) { |
| 548 | if (m[1]) { // Attribute. |
| 549 | // SetAttribute with uppercase names doesn't work on IE6. |
| 550 | const attribName = parserInterface.toLowerCase(m[1]); |
| 551 | // Use empty string as value for valueless attribs, so |
| 552 | // <input type=checkbox checked> |
| 553 | // gets attributes ['type', 'checkbox', 'checked', ''] |
| 554 | let decodedValue = ''; |
| 555 | if (m[2]) { |
| 556 | let encodedValue = m[3]; |
| 557 | switch (encodedValue.charCodeAt(0)) { // Strip quotes. |
| 558 | case 34: // double quote " |
| 559 | case 39: // single quote ' |
| 560 | encodedValue = |
| 561 | encodedValue.substring(1, encodedValue.length - 1); |
| 562 | break; |
| 563 | } |
| 564 | decodedValue = |
| 565 | this.unescapeEntities_(this.stripNULs_(encodedValue)); |
| 566 | } |
| 567 | attribs.push(attribName, decodedValue); |
| 568 | } else if (m[4]) { |
| 569 | if (eflags !== void 0) { // False if not in allowlist. |
| 570 | if (openTag) { |
| 571 | tagStack.startTag(new parserInterface.ParsedHtmlTag( |
no test coverage detected