Tokenize a sentence and yield tuples of (word, start, end). Parameters: - unicode_sentence: the str (unicode) to be segmented. - mode: "default" or "search"; "search" is for finer segmentation. - HMM: whether to use the Hidden Markov Model.
(self, unicode_sentence, mode="default", HMM=True)
| 474 | return freq |
| 475 | |
def tokenize(self, unicode_sentence, mode="default", HMM=True):
    """
    Segment a sentence and lazily yield (word, start, end) tuples.

    ``start`` and ``end`` are running character offsets: every word
    produced by ``self.cut()`` advances the offset by ``len(word)``.

    Parameters:
        - unicode_sentence: the unicode text to be segmented.
        - mode: "default" yields each word from ``self.cut()`` once.
          Any other value (conventionally "search", for finer
          segmentation) additionally yields, ahead of each word, every
          2-character substring (for words longer than 2) and then
          every 3-character substring (for words longer than 3) that
          has a truthy entry in ``self.FREQ``.
        - HMM: whether to use the Hidden Markov Model; forwarded to
          ``self.cut()``.

    Raises:
        ValueError: if ``unicode_sentence`` is not a ``text_type``
        instance. Because this is a generator, the check runs on the
        first iteration rather than at call time.
    """
    if not isinstance(unicode_sentence, text_type):
        raise ValueError("jieba: the input parameter should be unicode.")

    offset = 0
    default_mode = mode == 'default'
    for word in self.cut(unicode_sentence, HMM=HMM):
        length = len(word)
        if not default_mode:
            # All 2-grams are emitted before any 3-gram, and both come
            # before the full word itself.
            for size in (2, 3):
                if length > size:
                    for i in xrange(length - size + 1):
                        piece = word[i:i + size]
                        if self.FREQ.get(piece):
                            yield (piece, offset + i, offset + i + size)
        yield (word, offset, offset + length)
        offset += length
| 508 | |
| 509 | def set_dictionary(self, dictionary_path): |
| 510 | with self.lock: |