MCPcopy
hub / github.com/jwngr/sdow / fetch_page

Method fetch_page

sdow/database.py:31–84  ·  view source on GitHub ↗

Returns the ID and title of the non-redirect page corresponding to the provided title, handling titles with incorrect capitalization as well as redirects. Args: page_title: The title of the page to fetch. Returns: (int, str, bool): A tuple containing the page ID, title, and whether or not a redirect was followed.

(self, page_title)

Source from the content-addressed store, hash-verified

29 self.searches_cursor.arraysize = 1000
30
def fetch_page(self, page_title):
    """Returns the ID and title of the non-redirect page corresponding to the provided title,
    handling titles with incorrect capitalization as well as redirects.

    Pages are matched case-insensitively and then resolved in this order:
      1. A non-redirect page whose title exactly matches the sanitized title.
      2. Any other non-redirect page among the case-insensitive matches.
      3. The target of a matching redirect, trying an exact-title redirect first.

    Args:
      page_title: The title of the page to fetch.

    Returns:
      (int, str, bool): A tuple containing the page ID, the readable page title, and whether or
        not a redirect was followed.

    Raises:
      ValueError: If the provided page title is invalid, i.e. no page matches it or every
        matching page is a redirect whose target cannot be found.
    """
    sanitized_page_title = helpers.get_sanitized_page_title(page_title)
    not_found_message = 'Invalid page title {0} provided. Page title does not exist.'

    # Each returned row is unpacked below as (page ID, title, redirect flag).
    query = 'SELECT * FROM pages WHERE title = ? COLLATE NOCASE;'
    self.sdow_cursor.execute(query, (sanitized_page_title,))

    # Because the above query is case-insensitive (due to the COLLATE NOCASE), multiple articles
    # can be matched.
    results = self.sdow_cursor.fetchall()

    if not results:
        raise ValueError(not_found_message.format(page_title))

    # Move exact-title matches to the front. sorted() is stable, so rows otherwise keep the order
    # in which the database returned them (False sorts before True).
    candidates = sorted(results, key=lambda row: row[1] != sanitized_page_title)

    # Prefer a non-redirect page: an exact-title match if there is one, otherwise the first
    # case-insensitive match.
    for candidate_id, candidate_title, candidate_is_redirect in candidates:
        if not candidate_is_redirect:
            return (candidate_id, helpers.get_readable_page_title(candidate_title), False)

    # All the results are redirects, so follow one of them. The exact-title redirect is tried
    # first instead of whichever row the database happened to return first.
    query = 'SELECT target_id, title FROM redirects INNER JOIN pages ON pages.id = target_id WHERE source_id = ?;'
    for candidate_id, _, _ in candidates:
        self.sdow_cursor.execute(query, (candidate_id,))
        target = self.sdow_cursor.fetchone()
        if target:
            return (target[0], helpers.get_readable_page_title(target[1]), True)

    # TODO: This will no longer be required once the April 2018 database dump occurs since this
    # scenario is prevented by the prune_pages_file.py Python script during the database creation.
    raise ValueError(not_found_message.format(page_title))
85
86 def fetch_page_title(self, page_id):
87 """Returns the page title corresponding to the provided page ID.

Callers 1

shortest_paths_routeFunction · 0.80

Calls

no outgoing calls

Tested by

no test coverage detected