(self, doc)
| 655 | return set(tags) |
| 656 | |
| 657 | def calculate_best_node(self, doc): |
| 658 | top_node = None |
| 659 | nodes_to_check = self.nodes_to_check(doc) |
| 660 | starting_boost = float(1.0) |
| 661 | cnt = 0 |
| 662 | i = 0 |
| 663 | parent_nodes = [] |
| 664 | nodes_with_text = [] |
| 665 | |
| 666 | for node in nodes_to_check: |
| 667 | text_node = self.parser.getText(node) |
| 668 | word_stats = self.stopwords_class(language=self.language).\ |
| 669 | get_stopword_count(text_node) |
| 670 | high_link_density = self.is_highlink_density(node) |
| 671 | if word_stats.get_stopword_count() > 2 and not high_link_density: |
| 672 | nodes_with_text.append(node) |
| 673 | |
| 674 | nodes_number = len(nodes_with_text) |
| 675 | negative_scoring = 0 |
| 676 | bottom_negativescore_nodes = float(nodes_number) * 0.25 |
| 677 | |
| 678 | for node in nodes_with_text: |
| 679 | boost_score = float(0) |
| 680 | # boost |
| 681 | if(self.is_boostable(node)): |
| 682 | if cnt >= 0: |
| 683 | boost_score = float((1.0 / starting_boost) * 50) |
| 684 | starting_boost += 1 |
| 685 | # nodes_number |
| 686 | if nodes_number > 15: |
| 687 | if (nodes_number - i) <= bottom_negativescore_nodes: |
| 688 | booster = float( |
| 689 | bottom_negativescore_nodes - (nodes_number - i)) |
| 690 | boost_score = float(-pow(booster, float(2))) |
| 691 | negscore = abs(boost_score) + negative_scoring |
| 692 | if negscore > 40: |
| 693 | boost_score = float(5) |
| 694 | |
| 695 | text_node = self.parser.getText(node) |
| 696 | word_stats = self.stopwords_class(language=self.language).\ |
| 697 | get_stopword_count(text_node) |
| 698 | upscore = int(word_stats.get_stopword_count() + boost_score) |
| 699 | |
| 700 | parent_node = self.parser.getParent(node) |
| 701 | self.update_score(parent_node, upscore) |
| 702 | self.update_node_count(parent_node, 1) |
| 703 | |
| 704 | if parent_node not in parent_nodes: |
| 705 | parent_nodes.append(parent_node) |
| 706 | |
| 707 | # Parent of parent node |
| 708 | parent_parent_node = self.parser.getParent(parent_node) |
| 709 | if parent_parent_node is not None: |
| 710 | self.update_node_count(parent_parent_node, 1) |
| 711 | self.update_score(parent_parent_node, upscore / 2) |
| 712 | if parent_parent_node not in parent_nodes: |
| 713 | parent_nodes.append(parent_parent_node) |
| 714 | cnt += 1 |
no test coverage detected