Three strategies for publishing date extraction. The strategies are listed in descending order of accuracy, and the next strategy is only attempted if a preferred one fails: (1) pubdate from URL; (2) pubdate from metadata; (3) raw regex searches in the HTML plus added heuristics.
(self, url, doc)
| 170 | # return authors |
| 171 | |
def get_publishing_date(self, url, doc):
    """Extract the article's publishing date from its URL or metadata.

    The strategies are descending in accuracy and the next strategy is
    only attempted if a preferred one fails.

    1. Pubdate from URL
    2. Pubdate from metadata
    3. Raw regex searches in the HTML + added heuristics

    NOTE(review): only strategies 1 and 2 are implemented in this
    method; no raw-HTML regex fallback is present here.

    :param url: article URL, searched with ``urls.DATE_REGEX``.
    :param doc: parsed document passed to ``self.parser`` for the
        meta tag lookups.
    :return: the object returned by ``date_parser`` for the first
        candidate string that parses (presumably a ``datetime`` --
        confirm against the ``date_parser`` import), or ``None`` when
        no candidate yields a date.
    """

    def parse_date_str(date_str):
        # The attribute lookup below may hand back an empty or missing
        # value (presumably when the tag lacks that attribute); there
        # is nothing to parse in that case.
        if not date_str:
            return None
        try:
            return date_parser(date_str)
        except (ValueError, OverflowError, AttributeError, TypeError):
            # near all parse failures are due to URL dates without a day
            # specifier, e.g. /2014/04/
            # Only parse-related errors are swallowed so that
            # KeyboardInterrupt / SystemExit still propagate.
            return None

    # Strategy 1: a date embedded in the URL itself.
    date_match = re.search(urls.DATE_REGEX, url)
    if date_match:
        datetime_obj = parse_date_str(date_match.group(0))
        if datetime_obj:
            return datetime_obj

    # Strategy 2: well-known metadata tags, checked in this order.
    # 'attribute'/'value' select the element; 'content' names the
    # attribute on that element which holds the date string.
    PUBLISH_DATE_TAGS = [
        {'attribute': 'property', 'value': 'rnews:datePublished',
         'content': 'content'},
        {'attribute': 'property', 'value': 'article:published_time',
         'content': 'content'},
        {'attribute': 'name', 'value': 'OriginalPublicationDate',
         'content': 'content'},
        {'attribute': 'itemprop', 'value': 'datePublished',
         'content': 'datetime'},
        {'attribute': 'property', 'value': 'og:published_time',
         'content': 'content'},
        {'attribute': 'name', 'value': 'article_date_original',
         'content': 'content'},
        {'attribute': 'name', 'value': 'publication_date',
         'content': 'content'},
        {'attribute': 'name', 'value': 'sailthru.date',
         'content': 'content'},
        {'attribute': 'name', 'value': 'PublishDate',
         'content': 'content'},
    ]
    for known_meta_tag in PUBLISH_DATE_TAGS:
        meta_tags = self.parser.getElementsByTag(
            doc,
            attr=known_meta_tag['attribute'],
            value=known_meta_tag['value'])
        if not meta_tags:
            continue
        # Only the first matching element is consulted for each tag kind.
        date_str = self.parser.getAttribute(
            meta_tags[0],
            known_meta_tag['content'])
        datetime_obj = parse_date_str(date_str)
        if datetime_obj:
            return datetime_obj

    return None
| 223 | |
| 224 | def get_title(self, doc): |
| 225 | """Fetch the article title and analyze it |
no test coverage detected