# Materialise the 'esp.train' and 'esp.testb' files of NLTK's conll2002
# corpus as in-memory lists of IOB-annotated sentences.
train_sents = [*nltk.corpus.conll2002.iob_sents('esp.train')]
test_sents = [*nltk.corpus.conll2002.iob_sents('esp.testb')]
def _neighbor_features(token, prefix):
    """Return the context features of a neighbouring token.

    Args:
        token: A sequence whose first item is the word and whose second
            item is its part-of-speech tag.
        prefix: Offset label prepended to every key (e.g. ``'-1'``).

    Returns:
        A dict of five features keyed as ``'<prefix>:<feature name>'``.
    """
    word = token[0]
    postag = token[1]
    return {
        prefix + ':word.lower()': word.lower(),
        prefix + ':word.istitle()': word.istitle(),
        prefix + ':word.isupper()': word.isupper(),
        prefix + ':postag': postag,
        prefix + ':postag[:2]': postag[:2],
    }


def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    The features describe the token itself (lower-cased form, suffixes,
    casing/digit flags, POS tag) plus the immediately preceding and
    following tokens. A missing neighbour is signalled with a ``'BOS'``
    (no previous token) or ``'EOS'`` (no next token) flag instead.

    Args:
        sent: A sequence of tokens; each token is a sequence whose first
            item is the word string and whose second item is the POS tag.
        i: Index of the token to describe.

    Returns:
        A dict mapping feature names to string, bool or float values.
    """
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }

    # Left context: previous token, or a beginning-of-sentence marker.
    if i > 0:
        features.update(_neighbor_features(sent[i - 1], '-1'))
    else:
        features['BOS'] = True

    # Right context: next token, or an end-of-sentence marker.
    if i < len(sent) - 1:
        features.update(_neighbor_features(sent[i + 1], '+1'))
    else:
        features['EOS'] = True

    return features
| 49 | |
| 50 | |
| 51 | def sent2features(sent): |