`make_dataset`: function to create datasets and tokenizers for common options.
(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.],
delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None,
tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None,
model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, **kwargs)
| 59 | return corpus_name in corpora.NAMED_CORPORA |
| 60 | |
| 61 | def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None, split=[1.], |
| 62 | delim=',', loose=False, binarize_sent=False, drop_unlabeled=False, tokenizer=None, |
| 63 | tokenizer_type='CharacterLevelTokenizer', tokenizer_model_path=None, vocab_size=None, |
| 64 | model_type='bpe', pad_token=0, character_converage=1.0, non_binary_cols=None, **kwargs): |
| 65 | """function to create datasets+tokenizers for common options""" |
| 66 | if isinstance(process_fn, str): |
| 67 | process_fn = eval(process_fn) |
| 68 | if non_binary_cols is not None: |
| 69 | # multilabel dataset support (only for csvs) |
| 70 | label_key = non_binary_cols |
| 71 | def get_dataset_from_path(path_): |
| 72 | if lazy: |
| 73 | # get lazily loaded dataset |
| 74 | named_corpora = False |
| 75 | if supported_corpus(path_): |
| 76 | named_corpora = True |
| 77 | name = path_ |
| 78 | path_ = corpora.NAMED_CORPORA[path_].PATH |
| 79 | if not exists_lazy(path_, data_type='data'): |
| 80 | # create cached version of dataset for lazy loading if it doesn't exist |
| 81 | text = get_dataset(name if named_corpora else path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, |
| 82 | delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose) |
| 83 | make_lazy(path_, text.X, data_type='data') |
| 84 | text = lazy_array_loader(path_, data_type='data', map_fn=process_fn) |
| 85 | else: |
| 86 | # get dataset |
| 87 | text = get_dataset(path_, text_key=text_key, label_key=label_key, binarize_sent=binarize_sent, |
| 88 | delim=delim, drop_unlabeled=drop_unlabeled, loose_json=loose, preprocess_fn=process_fn) |
| 89 | return text |
| 90 | # get one or multiple datasets and concatenate |
| 91 | if isinstance(path, str): |
| 92 | path = [path] |
| 93 | datasets = [get_dataset_from_path(p) for p in path] |
| 94 | if len(datasets) == 1: |
| 95 | ds = datasets[0] |
| 96 | else: |
| 97 | ds = ConcatDataset(datasets) |
| 98 | # make tokenizer for dataset |
| 99 | if tokenizer is None: |
| 100 | tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type, |
| 101 | pad_token, character_converage, **kwargs) |
| 102 | |
| 103 | ds_type = '' |
| 104 | if 'ds_type' in kwargs: |
| 105 | ds_type = kwargs['ds_type'] |
| 106 | ds.SetTokenizer(tokenizer) |
| 107 | # Split dataset into train/val/test (and wrap bert dataset) |
| 108 | if should_split(split): |
| 109 | ds = split_ds(ds, split) |
| 110 | if ds_type.lower() == 'bert': |
| 111 | presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False |
| 112 | ds = [bert_sentencepair_dataset(d, max_seq_len=seq_length, presplit_sentences=presplit_sentences) if d is not None else None for d in ds] |
| 113 | elif ds_type.lower() == 'gpt2': |
| 114 | ds = [GPT2Dataset(d, max_seq_len=seq_length) if d is not None else None for d in ds] |
| 115 | else: |
| 116 | if ds_type.lower() == 'bert': |
| 117 | presplit_sentences = kwargs['presplit_sentences'] if 'presplit_sentences' in kwargs else False |
| 118 | ds = bert_sentencepair_dataset(ds, max_seq_len=seq_length, presplit_sentences=presplit_sentences) |
Nothing calls this function directly.
No test coverage was detected for it.