hub / github.com/jwngr/sdow / fetch_page

Method fetch_page

sdow/database.py:31–84 · view source on GitHub ↗

Returns the ID and title of the non-redirect page corresponding to the provided title, handling titles with incorrect capitalization as well as redirects. Args: page_title: The title of the page to fetch. Returns: (int, str, bool): A tuple containing the page ID, title, and whether or not a redirect was followed.

(self, page_title)

Source from the content-addressed store, hash-verified

29	self.searches_cursor.arraysize = 1000
30
def fetch_page(self, page_title):
    """Returns the ID and title of the non-redirect page corresponding to the provided title,
    handling titles with incorrect capitalization as well as redirects.

    Candidate rows are looked up case-insensitively and resolved in this order of preference:
      1. A non-redirect page whose title exactly equals the sanitized title.
      2. The first non-redirect page among the case-insensitive matches.
      3. The target of a redirect, preferring the redirect whose title exactly equals the
         sanitized title and otherwise falling back to the first matched redirect.

    Args:
      page_title: The title of the page to fetch.

    Returns:
      (int, str, bool): A tuple containing the page ID, the title as formatted by
        helpers.get_readable_page_title(), and whether or not a redirect was followed.

    Raises:
      ValueError: If the provided page title is invalid: either no page matches it, or every
        match is a redirect and the redirect that was followed has no target page.
    """
    sanitized_page_title = helpers.get_sanitized_page_title(page_title)

    query = 'SELECT * FROM pages WHERE title = ? COLLATE NOCASE;'
    query_bindings = (sanitized_page_title,)
    self.sdow_cursor.execute(query, query_bindings)

    # Because the above query is case-insensitive (due to the COLLATE NOCASE), multiple articles
    # can be matched.
    results = self.sdow_cursor.fetchall()

    if not results:
        raise ValueError(
            'Invalid page title {0} provided. Page title does not exist.'.format(page_title))

    # First, look for a non-redirect page which has exact match with the page title.
    for current_page_id, current_page_title, current_page_is_redirect in results:
        if current_page_title == sanitized_page_title and not current_page_is_redirect:
            return (current_page_id, helpers.get_readable_page_title(current_page_title), False)

    # Next, look for a match with a non-redirect page.
    for current_page_id, current_page_title, current_page_is_redirect in results:
        if not current_page_is_redirect:
            return (current_page_id, helpers.get_readable_page_title(current_page_title), False)

    # Every result is a redirect at this point. Follow the redirect whose title matches the
    # requested title exactly when there is one, so that e.g. "Foo" is not resolved through the
    # redirect stored for "FOO"; otherwise fall back to the first matched redirect.
    redirect_source_id = next(
        (current_page_id for current_page_id, current_page_title, _ in results
         if current_page_title == sanitized_page_title),
        results[0][0])

    query = 'SELECT target_id, title FROM redirects INNER JOIN pages ON pages.id = target_id WHERE source_id = ?;'
    query_bindings = (redirect_source_id,)
    self.sdow_cursor.execute(query, query_bindings)

    result = self.sdow_cursor.fetchone()

    # TODO: This will no longer be required once the April 2018 database dump occurs since this
    # scenario is prevented by the prune_pages_file.py Python script during the database creation.
    if not result:
        raise ValueError(
            'Invalid page title {0} provided. Page title does not exist.'.format(page_title))

    return (result[0], helpers.get_readable_page_title(result[1]), True)
85
86	def fetch_page_title(self, page_id):
87	"""Returns the page title corresponding to the provided page ID.

Callers 1

shortest_paths_route · Function · 0.80

Calls

no outgoing calls

Tested by

no test coverage detected