A lot of the time the first paragraph might be the caption under an image, so if we're going to boost a parent node we'll want to make sure that it is connected to other paragraphs, at least for the first n paragraphs; that is, we'll want to make sure that the next sibling is a…
(self, node)
| 727 | return top_node |
| 728 | |
| 729 | def is_boostable(self, node): |
| 730 | """Return True when a sibling paragraph of `node` carries substantial text. |
| 731 | A lot of the time the first paragraph might be the caption under an image, |
| 732 | so before boosting a parent node we check that it is connected to other |
| 733 | paragraphs: one of the nodes from walk_siblings(node) must be a <p> with more |
| 734 | than minimum_stopword_count stop words; otherwise the result is False. |
| 735 | """ |
| 736 | para = "p" |
| 737 | steps_away = 0 |
| 738 | minimum_stopword_count = 5 |
| 739 | max_stepsaway_from_node = 3 |
| 740 | |
| 741 | nodes = self.walk_siblings(node) |
| 742 | for current_node in nodes: |
| 743 | # only <p> siblings are examined; other tags fail the tag check below |
| 744 | current_node_tag = self.parser.getTag(current_node) |
| 745 | if current_node_tag == para: |
| 746 | if steps_away >= max_stepsaway_from_node:  # step budget used up: not boostable |
| 747 | return False |
| 748 | paraText = self.parser.getText(current_node) |
| 749 | word_stats = self.stopwords_class(language=self.language).\ |
| 750 | get_stopword_count(paraText) |
| 751 | if word_stats.get_stopword_count() > minimum_stopword_count:  # strictly more than the minimum |
| 752 | return True |
| 753 | steps_away += 1  # NOTE(review): indentation is lost in this listing; presumably this counts <p> siblings only - confirm against the real file |
| 754 | return False |
| 755 | |
| 756 | def walk_siblings(self, node): |
| 757 | current_sibling = self.parser.previousSibling(node) |
no test coverage detected