MCPcopy
hub / github.com/google-deepmind/gemma / map

Method map

gemma/gm/data/_tasks.py:99–155  ·  view source on GitHub ↗
(self, element)

Source from the content-addressed store, hash-verified

97 sampling: bool = False
98
def map(self, element):
    """Turn one prompt/response example into model-ready fields.

    The element is copied with `etree.copy` first, so the caller's object is
    not modified. The prompt and response are read from `self.in_prompt` and
    `self.in_response`, passed through `_decode_bytes`, and wrapped with
    `_template.PROMPT` / `_template.ANSWER`.

    When `self.sampling` is set, the formatted strings are stored under
    `self.out_input` and `self.out_target` and the element is returned
    without tokenization. Otherwise both strings are tokenized, combined via
    `_functional.make_seq2seq_fields`, padded via `_functional.pad`, and
    stored under `self.out_input`, `self.out_target` and
    `self.out_target_mask`.

    Args:
        element: The example to transform.

    Returns:
        The copied element with the output fields added (and the input
        fields removed when `self.drop_inputs` is set).
    """
    # Work on a copy so the input example is left untouched.
    element = etree.copy(element)

    # `get_by_path` behaves like `element[path]`, but also handles nested
    # dicts and dataclasses.
    raw_prompt = kd.kontext.get_by_path(element, self.in_prompt)
    raw_response = kd.kontext.get_by_path(element, self.in_response)

    # TODO(epot): Support nested drop.
    if self.drop_inputs:
        for consumed_key in (self.in_prompt, self.in_response):
            del element[consumed_key]

    # Some datasets (TFDS) return `bytes` instead of `str`, so decode them.
    prompt_text = _decode_bytes(raw_prompt)
    response_text = _decode_bytes(raw_response)

    # Wrap both sides in the expected dialog template.
    # TODO(epot): Add a `template` protocol to allow customizing this.
    prompt_text = _template.PROMPT.format(prompt_text)
    response_text = _template.ANSWER.format(response_text)

    if self.sampling:
        # Sampling works on the raw strings: no tokenization needed.
        outputs = (
            (self.out_input, prompt_text),
            (self.out_target, response_text),
        )
    else:
        # Tokenize the prompt (with BOS) and the response.
        prompt_tokens = self.tokenizer.encode(prompt_text, add_bos=True)
        response_tokens = self.tokenizer.encode(response_text)

        # Build the model input/target/loss-mask fields, then pad them.
        fields = _functional.make_seq2seq_fields(
            prompt=prompt_tokens,
            response=response_tokens,
        )
        fields = _functional.pad(
            fields,
            max_length=self.max_length,
            truncate=self.truncate,
        )

        # Append a trailing axis for shape compatibility with the loss.
        target = einops.rearrange(fields.target, "... -> ... 1")
        target_mask = einops.rearrange(fields.target_mask, "... -> ... 1")

        outputs = (
            (self.out_input, fields.input),
            (self.out_target, target),
            (self.out_target_mask, target_mask),
        )

    # Equivalent to `element[path] = value`, with nested-path support.
    for out_path, value in outputs:
        kd.kontext.set_by_path(element, out_path, value)
    return element
156

Callers 15

_merge_initial_cacheFunction · 0.45
decoratedFunction · 0.45
padFunction · 0.45
mapMethod · 0.45
transformMethod · 0.45
load_paramsFunction · 0.45
_wrap_skipFunction · 0.45
_unwrap_skipFunction · 0.45
_release_skipFunction · 0.45
release_memoryFunction · 0.45

Calls 3

_decode_bytesFunction · 0.85
padMethod · 0.80
encodeMethod · 0.45

Tested by 2

test_moduleFunction · 0.36