| 134 | } |
| 135 | |
| 136 | func GetHTMLMeta(urlStr string) (*HTMLMeta, error) { |
| 137 | if err := validateURL(urlStr); err != nil { |
| 138 | return nil, err |
| 139 | } |
| 140 | |
| 141 | response, err := httpClient.Get(urlStr) |
| 142 | if err != nil { |
| 143 | return nil, err |
| 144 | } |
| 145 | defer response.Body.Close() |
| 146 | |
| 147 | mediatype, err := getMediatype(response) |
| 148 | if err != nil { |
| 149 | return nil, err |
| 150 | } |
| 151 | if mediatype != "text/html" { |
| 152 | return nil, errors.New("not a HTML page") |
| 153 | } |
| 154 | |
| 155 | htmlMeta := extractHTMLMeta(io.LimitReader(response.Body, maxHTMLMetaBytes)) |
| 156 | enrichSiteMeta(response.Request.URL, htmlMeta) |
| 157 | return htmlMeta, nil |
| 158 | } |
| 159 | |
| 160 | func extractHTMLMeta(resp io.Reader) *HTMLMeta { |
| 161 | tokenizer := html.NewTokenizer(resp) |