Prepare the dataset. Return a list with one entry per sentence, each entry being a list containing: - the words (first element of each token) - the word char indexes - the segmentation features - the tag indexes
(sentences, char_to_id, tag_to_id, lower=False, train=True)
| 89 | |
| 90 | |
def prepare_dataset(sentences, char_to_id, tag_to_id, lower=False, train=True):
    """
    Turn raw sentences into index-based entries.

    One four-item list is produced per sentence:
        - the first element of every token, as found in the input
        - the char_to_id index of each of those elements (lower-cased
          first when `lower` is set; anything missing from char_to_id
          maps to the '<UNK>' entry)
        - the result of get_seg_features on the joined elements
        - the tag_to_id index of every token's last element when
          `train` is set, otherwise the "O" index repeated once per token
    """
    # Looked up eagerly, so a tag map without "O" fails even in train mode.
    outside_id = tag_to_id["O"]

    def lookup_char(token):
        key = token.lower() if lower else token
        if key not in char_to_id:
            key = '<UNK>'
        return char_to_id[key]

    dataset = []
    for sentence in sentences:
        tokens = [item[0] for item in sentence]
        char_ids = [lookup_char(token) for token in tokens]
        seg_feats = get_seg_features("".join(tokens))
        if not train:
            # No gold tags available: fill with the "O" index.
            tag_ids = [outside_id] * len(char_ids)
        else:
            tag_ids = [tag_to_id[item[-1]] for item in sentence]
        dataset.append([tokens, char_ids, seg_feats, tag_ids])

    return dataset
| 116 | |
| 117 | |
| 118 | def augment_with_pretrained(dictionary, ext_emb_path, chars): |
no test coverage detected