MCPcopy
hub / github.com/codelucas/newspaper / get_publishing_date

Method get_publishing_date

newspaper/extractors.py:172–222  ·  view source on GitHub ↗

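Three strategies for publishing date extraction. The strategies are listed in descending order of accuracy, and each subsequent strategy is attempted only if the preceding one fails: (1) pubdate from the URL; (2) pubdate from metadata; (3) raw regex searches in the HTML plus added heuristics. Note: in the source listed below only strategies 1 and 2 are implemented; the method returns None when both fail.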

(self, url, doc)

Source from the content-addressed store, hash-verified

170 # return authors
171
def get_publishing_date(self, url, doc):
    """Extract the article's publishing date, most reliable source first.

    The strategies are descending in accuracy and the next strategy is
    only attempted if a preferred one fails.

    1. Pubdate from URL
    2. Pubdate from metadata
    3. Raw regex searches in the HTML + added heuristics
       (NOTE: not implemented here -- once strategies 1 and 2 are
       exhausted this method returns None)

    :param url: article URL, searched with ``urls.DATE_REGEX``
    :param doc: parsed document handed to ``self.parser`` for the
        meta-tag lookups
    :return: the object produced by ``date_parser`` for the first date
        string that parses, otherwise ``None``
    """

    def parse_date_str(date_str):
        """Return the parsed date for ``date_str``, or None on failure."""
        # An absent or empty attribute value can never be a date, so do
        # not hand it to the parser at all.
        if not date_str:
            return None
        try:
            return date_parser(date_str)
        except (ValueError, OverflowError, AttributeError, TypeError):
            # near all parse failures are due to URL dates without a day
            # specifier, e.g. /2014/04/
            # Only parse-related errors are caught: a bare ``except``
            # would also swallow KeyboardInterrupt and SystemExit.
            return None

    # Strategy 1: a date embedded in the URL itself.
    date_match = re.search(urls.DATE_REGEX, url)
    if date_match:
        datetime_obj = parse_date_str(date_match.group(0))
        if datetime_obj:
            return datetime_obj

    # Strategy 2: well-known meta tags, checked in this order.
    # 'attribute'/'value' select the element; 'content' names the
    # attribute on that element that holds the date string.
    PUBLISH_DATE_TAGS = (
        {'attribute': 'property', 'value': 'rnews:datePublished',
         'content': 'content'},
        {'attribute': 'property', 'value': 'article:published_time',
         'content': 'content'},
        {'attribute': 'name', 'value': 'OriginalPublicationDate',
         'content': 'content'},
        {'attribute': 'itemprop', 'value': 'datePublished',
         'content': 'datetime'},
        {'attribute': 'property', 'value': 'og:published_time',
         'content': 'content'},
        {'attribute': 'name', 'value': 'article_date_original',
         'content': 'content'},
        {'attribute': 'name', 'value': 'publication_date',
         'content': 'content'},
        {'attribute': 'name', 'value': 'sailthru.date',
         'content': 'content'},
        {'attribute': 'name', 'value': 'PublishDate',
         'content': 'content'},
    )
    for known_meta_tag in PUBLISH_DATE_TAGS:
        meta_tags = self.parser.getElementsByTag(
            doc,
            attr=known_meta_tag['attribute'],
            value=known_meta_tag['value'])
        if not meta_tags:
            continue
        # Only the first matching element is consulted for each tag.
        date_str = self.parser.getAttribute(
            meta_tags[0],
            known_meta_tag['content'])
        datetime_obj = parse_date_str(date_str)
        if datetime_obj:
            return datetime_obj

    return None
223
224 def get_title(self, doc):
225 """Fetch the article title and analyze it

Callers 1

parseMethod · 0.80

Calls 2

getElementsByTagMethod · 0.80
getAttributeMethod · 0.80

Tested by

no test coverage detected