hub / github.com/codelucas/newspaper / get_publishing_date

Method get_publishing_date

newspaper/extractors.py:172–222 · view source on GitHub ↗

Three strategies for publishing date extraction, listed in descending order of accuracy; the next strategy is attempted only if the preferred one fails: (1) pubdate from the URL, (2) pubdate from metadata, (3) raw regex searches in the HTML plus added heuristics.

(self, url, doc)

Source from the content-addressed store, hash-verified

170	# return authors
171
def get_publishing_date(self, url, doc):
    """Extract the article's publishing date.

    3 strategies for publishing date extraction. The strategies
    are descending in accuracy and the next strategy is only
    attempted if a preferred one fails.

    1. Pubdate from URL
    2. Pubdate from metadata
    3. Raw regex searches in the HTML + added heuristics

    NOTE: only strategies 1 and 2 are implemented in this method;
    nothing here performs the raw regex search described in 3.

    :param url: article URL, searched with ``urls.DATE_REGEX``
    :param doc: parsed document passed to ``self.parser`` for the
        meta tag lookups
    :return: the value produced by ``date_parser`` for the first date
        string that parses, or ``None`` when no strategy yields a date
    """

    def parse_date_str(date_str):
        """Return the parsed date for ``date_str``, or None on failure."""
        # NOTE(review): presumably getAttribute yields None/'' when the
        # matched tag lacks the attribute -- confirm. Either way there is
        # nothing to parse, so skip the parser instead of relying on it
        # to raise.
        if not date_str:
            return None
        try:
            return date_parser(date_str)
        except Exception:
            # Deliberately best-effort: near all parse failures are due
            # to URL dates without a day specifier, e.g. /2014/04/.
            # ``Exception`` (not a bare ``except``) so KeyboardInterrupt
            # and SystemExit still propagate.
            return None

    # Strategy 1: a date embedded in the URL itself.
    date_match = re.search(urls.DATE_REGEX, url)
    if date_match:
        datetime_obj = parse_date_str(date_match.group(0))
        if datetime_obj:
            return datetime_obj

    # Strategy 2: well-known metadata tags. 'attribute'/'value' select
    # the element, 'content' names the attribute holding the date text.
    # The order below is the lookup priority.
    PUBLISH_DATE_TAGS = [
        {'attribute': 'property', 'value': 'rnews:datePublished',
         'content': 'content'},
        {'attribute': 'property', 'value': 'article:published_time',
         'content': 'content'},
        {'attribute': 'name', 'value': 'OriginalPublicationDate',
         'content': 'content'},
        {'attribute': 'itemprop', 'value': 'datePublished',
         'content': 'datetime'},
        {'attribute': 'property', 'value': 'og:published_time',
         'content': 'content'},
        {'attribute': 'name', 'value': 'article_date_original',
         'content': 'content'},
        {'attribute': 'name', 'value': 'publication_date',
         'content': 'content'},
        {'attribute': 'name', 'value': 'sailthru.date',
         'content': 'content'},
        {'attribute': 'name', 'value': 'PublishDate',
         'content': 'content'},
    ]
    for known_meta_tag in PUBLISH_DATE_TAGS:
        meta_tags = self.parser.getElementsByTag(
            doc,
            attr=known_meta_tag['attribute'],
            value=known_meta_tag['value'])
        if meta_tags:
            # Only the first matching element is consulted per tag kind.
            date_str = self.parser.getAttribute(
                meta_tags[0],
                known_meta_tag['content'])
            datetime_obj = parse_date_str(date_str)
            if datetime_obj:
                return datetime_obj

    return None
223
224	def get_title(self, doc):
225	"""Fetch the article title and analyze it

Callers 1

parseMethod · 0.80

Calls 2

getElementsByTagMethod · 0.80

getAttributeMethod · 0.80

Tested by

no test coverage detected