Takes a candidate line of HTML or text and extracts the name(s) in list form. >>> parse_byline(' By: Lucas Ou-Yang , Alex Smith ') ['Lucas Ou-Yang', 'Alex Smith']
(search_str)
| 92 | return result |
| 93 | |
def parse_byline(search_str):
    """Extract author names from a candidate byline.

    Takes a candidate line of html or text and returns the name(s) it
    contains in list form.

    >>> parse_byline('<div>By: <strong>Lucas Ou-Yang</strong>, '
    ...              '<strong>Alex Smith</strong></div>')
    ['Lucas Ou-Yang', 'Alex Smith']

    Processing steps:
      1. HTML tags are removed.
      2. A "By"/"From" marker followed by ':' or whitespace is removed,
         but only where it starts a word, so names that merely contain
         those letters (e.g. "Abby", "Kirby") are left intact.
      3. The remainder is split into tokens; 'and' and empty tokens
         (produced by punctuation such as ", ") separate one name from
         the next.
      4. Tokens for which ``contains_digits`` is true are discarded.

    A name terminated by a delimiter is kept regardless of its length,
    whereas the trailing name (not followed by a delimiter) is only kept
    when it consists of at least two tokens.

    :param search_str: candidate byline, as html or plain text
    :return: list of author name strings, in order of appearance
    """
    # Remove HTML boilerplate
    search_str = re.sub(r'<[^<]+?>', '', search_str)

    # Remove original By statement. The leading \b keeps the pattern from
    # matching inside a word: without it "Abby Smith" became "AbSmith".
    search_str = re.sub(r'\b(?:[bB][yY]|[fF]rom)[:\s]', '', search_str)

    search_str = search_str.strip()

    # Chunk the line by non alphanumeric tokens (few name exceptions)
    # >>> re.split(r"[^\w'\-.]", "Tyler G. Jones, Lucas Ou, Dean O'Brian and Ronald")
    # ['Tyler', 'G.', 'Jones', '', 'Lucas', 'Ou', '', 'Dean', "O'Brian", 'and', 'Ronald']
    name_tokens = [tok.strip() for tok in re.split(r"[^\w'\-.]", search_str)]

    delimiters = ('and', ',', '')
    authors = []
    # Tokens (first name, last name, ...) of the name being assembled
    curname = []

    for token in name_tokens:
        if token in delimiters:
            # A delimiter closes the name collected so far, if any
            if curname:
                authors.append(' '.join(curname))
                curname = []
        elif not contains_digits(token):
            curname.append(token)

    # One last check at end: a trailing single token is not treated as a name
    if len(curname) >= 2:
        authors.append(' '.join(curname))

    return authors
| 135 | |
| 136 | # Try 1: Search popular author tags for authors |
| 137 |