Returns a list of shortest paths from the source to target pages by running a bi-directional breadth-first search on the graph of Wikipedia pages. Args: source_page_id: The page at which to start the search. target_page_id: The page at which to end the search. database: An Database
(source_page_id, target_page_id, database)
| 34 | |
| 35 | |
| 36 | def breadth_first_search(source_page_id, target_page_id, database): |
| 37 | """Returns a list of shortest paths from the source to target pages by running a bi-directional |
| 38 | breadth-first search on the graph of Wikipedia pages. |
| 39 | |
| 40 | Args: |
| 41 | source_page_id: The page at which to start the search. |
| 42 | target_page_id: The page at which to end the search. |
| 43 | database: An Database instance which contains methods to query the Wikipedia database. |
| 44 | |
| 45 | Returns: |
| 46 | list(list(int)): A list of lists of page IDs corresponding to paths from the source page to the |
| 47 | target page. |
| 48 | """ |
| 49 | # If the source and target page IDs are identical, return the trivial path. |
| 50 | if source_page_id == target_page_id: |
| 51 | return [[source_page_id]] |
| 52 | |
| 53 | paths = [] |
| 54 | |
| 55 | # The unvisited dictionaries are a mapping from page ID to a list of that page's parents' IDs. |
| 56 | # None signifies that the source and target pages have no parent. |
| 57 | unvisited_forward = {source_page_id: [None]} |
| 58 | unvisited_backward = {target_page_id: [None]} |
| 59 | |
| 60 | # The visited dictionaries are a mapping from page ID to a list of that page's parents' IDs. |
| 61 | visited_forward = {} |
| 62 | visited_backward = {} |
| 63 | |
| 64 | # Set the initial forward and backward depths to 0. |
| 65 | forward_depth = 0 |
| 66 | backward_depth = 0 |
| 67 | |
| 68 | # Continue the breadth first search until a path has been found or either of the unvisited lists |
| 69 | # are empty. |
| 70 | while (len(paths) == 0) and ((len(unvisited_forward) != 0) and (len(unvisited_backward) != 0)): |
| 71 | # Run the next iteration of the breadth first search in whichever direction has the smaller number |
| 72 | # of links at the next level. |
| 73 | forward_links_count = database.fetch_outgoing_links_count(unvisited_forward.keys()) |
| 74 | backward_links_count = database.fetch_incoming_links_count(unvisited_backward.keys()) |
| 75 | |
| 76 | if forward_links_count < backward_links_count: |
| 77 | #--- FORWARD BREADTH FIRST SEARCH ---# |
| 78 | forward_depth += 1 |
| 79 | |
| 80 | # Fetch the pages which can be reached from the currently unvisited forward pages. |
| 81 | # The replace() bit is some hackery to handle Python printing a trailing ',' when there is |
| 82 | # only one key. |
| 83 | outgoing_links = database.fetch_outgoing_links(unvisited_forward.keys()) |
| 84 | |
| 85 | # Mark all of the unvisited forward pages as visited. |
| 86 | for page_id in unvisited_forward: |
| 87 | visited_forward[page_id] = unvisited_forward[page_id] |
| 88 | |
| 89 | # Clear the unvisited forward dictionary. |
| 90 | unvisited_forward.clear() |
| 91 | |
| 92 | for source_page_id, target_page_ids in outgoing_links: |
| 93 | for target_page_id in target_page_ids.split('|'): |
no test coverage detected