MCPcopy
hub / github.com/codelucas/newspaper / calculate_best_node

Method calculate_best_node

newspaper/extractors.py:657–727  ·  view source on GitHub ↗
(self, doc)

Source from the content-addressed store, hash-verified

655 return set(tags)
656
657 def calculate_best_node(self, doc):
658 top_node = None
659 nodes_to_check = self.nodes_to_check(doc)
660 starting_boost = float(1.0)
661 cnt = 0
662 i = 0
663 parent_nodes = []
664 nodes_with_text = []
665
666 for node in nodes_to_check:
667 text_node = self.parser.getText(node)
668 word_stats = self.stopwords_class(language=self.language).\
669 get_stopword_count(text_node)
670 high_link_density = self.is_highlink_density(node)
671 if word_stats.get_stopword_count() > 2 and not high_link_density:
672 nodes_with_text.append(node)
673
674 nodes_number = len(nodes_with_text)
675 negative_scoring = 0
676 bottom_negativescore_nodes = float(nodes_number) * 0.25
677
678 for node in nodes_with_text:
679 boost_score = float(0)
680 # boost
681 if(self.is_boostable(node)):
682 if cnt >= 0:
683 boost_score = float((1.0 / starting_boost) * 50)
684 starting_boost += 1
685 # nodes_number
686 if nodes_number > 15:
687 if (nodes_number - i) <= bottom_negativescore_nodes:
688 booster = float(
689 bottom_negativescore_nodes - (nodes_number - i))
690 boost_score = float(-pow(booster, float(2)))
691 negscore = abs(boost_score) + negative_scoring
692 if negscore > 40:
693 boost_score = float(5)
694
695 text_node = self.parser.getText(node)
696 word_stats = self.stopwords_class(language=self.language).\
697 get_stopword_count(text_node)
698 upscore = int(word_stats.get_stopword_count() + boost_score)
699
700 parent_node = self.parser.getParent(node)
701 self.update_score(parent_node, upscore)
702 self.update_node_count(parent_node, 1)
703
704 if parent_node not in parent_nodes:
705 parent_nodes.append(parent_node)
706
707 # Parent of parent node
708 parent_parent_node = self.parser.getParent(parent_node)
709 if parent_parent_node is not None:
710 self.update_node_count(parent_parent_node, 1)
711 self.update_score(parent_parent_node, upscore / 2)
712 if parent_parent_node not in parent_nodes:
713 parent_nodes.append(parent_parent_node)
714 cnt += 1

Callers 2

fulltext (Function) · 0.95
parse (Method) · 0.80

Calls 10

nodes_to_check (Method) · 0.95
is_highlink_density (Method) · 0.95
is_boostable (Method) · 0.95
update_score (Method) · 0.95
update_node_count (Method) · 0.95
get_score (Method) · 0.95
getText (Method) · 0.80
append (Method) · 0.80
getParent (Method) · 0.80
get_stopword_count (Method) · 0.45

Tested by

no test coverage detected