MCPcopy
hub / github.com/codelucas/newspaper / get_title

Method get_title

newspaper/extractors.py:224–267  ·  view source on GitHub ↗

Fetch the article title and analyze it

(self, doc)

Source from the content-addressed store, hash-verified

222 return None
223
def get_title(self, doc):
    """Fetch the article title and analyze it.

    Reads the text of the first ``<title>`` element in ``doc``. If that
    text contains one of the known delimiter characters, the text is
    handed to ``self.split_title`` together with the matching splitter
    constant. Only the first delimiter that matches (in the priority
    order ``|``, ``-``, ``_``, ``/``, ``»``, ``:``) is used; once a split
    has been performed no further delimiter is tried.

    :param doc: parsed document passed through to ``self.parser``.
    :return: the result of ``MOTLEY_REPLACEMENT.replaceAll`` on the
        (possibly split) title text, or ``''`` when the document has no
        ``<title>`` element.
    """
    title_element = self.parser.getElementsByTag(doc, tag='title')
    # no title found
    if title_element is None or len(title_element) == 0:
        return ''

    # title elem found
    title_text = self.parser.getText(title_element[0])

    # Delimiters in priority order, paired with their splitter constant.
    delimiters = (
        ('|', PIPE_SPLITTER),
        ('-', DASH_SPLITTER),
        ('_', UNDERSCORE_SPLITTER),
        ('/', SLASH_SPLITTER),
        ('»', ARROWS_SPLITTER),
        (':', COLON_SPLITTER),
    )

    for delimiter, splitter in delimiters:
        if delimiter in title_text:
            title_text = self.split_title(title_text, splitter)
            # Stop after the first successful split. Previously the
            # underscore branch forgot to record that a delimiter had
            # been used, so a title split on '_' could be split again
            # on '/', '»' or ':'.
            break

    return MOTLEY_REPLACEMENT.replaceAll(title_text)
268
269 def split_title(self, title, splitter):
270 """Split the title to best part possible

Callers 1

parseMethod · 0.80

Calls 4

split_titleMethod · 0.95
getElementsByTagMethod · 0.80
getTextMethod · 0.80
replaceAllMethod · 0.45

Tested by

no test coverage detected