MCPcopy
hub / github.com/XPixelGroup/DiffBIR / preprocess_plain

Function preprocess_plain

llava/train/train.py:588–607  ·  view source on GitHub ↗
(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
)

Source from the content-addressed store, hash-verified

586
587
def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
) -> Dict:
    """Tokenize "plain" image/text pairs and mask the image prompt in the labels.

    Each element of ``sources`` is indexed as a two-entry sequence of
    mappings with a ``'value'`` key (so the ``Sequence[str]`` annotation is
    looser than the real usage). The first entry must contain
    ``DEFAULT_IMAGE_TOKEN``; its ``'value'`` is replaced by exactly that token,
    and the second entry's ``'value'`` plus the default conversation separator
    is appended to form the training text.

    Note:
        ``sources`` is modified in place: ``source[0]['value']`` is overwritten
        with ``DEFAULT_IMAGE_TOKEN`` for every element.

    Args:
        sources: Batch of two-entry conversations as described above.
        tokenizer: Tokenizer forwarded to ``tokenizer_image_token``.

    Returns:
        A dict with ``input_ids`` (one tokenized conversation per source) and
        ``labels`` (a deep copy of ``input_ids`` whose leading image-prompt
        positions are set to ``IGNORE_INDEX``).

    Raises:
        AssertionError: If a source does not have exactly two entries or its
            first entry lacks ``DEFAULT_IMAGE_TOKEN``. These checks are plain
            ``assert`` statements, so they are skipped under ``python -O``.
    """
    # Add end signal and concatenate together.
    conversations = []
    for source in sources:
        assert len(source) == 2
        assert DEFAULT_IMAGE_TOKEN in source[0]['value']
        # Drop any text that accompanied the image token in the prompt.
        source[0]['value'] = DEFAULT_IMAGE_TOKEN
        conversations.append(
            source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
        )

    # Tokenize conversations (presumably into PyTorch tensors, given
    # return_tensors='pt' and the slice assignment below).
    input_ids = [
        tokenizer_image_token(prompt, tokenizer, return_tensors='pt')
        for prompt in conversations
    ]
    targets = copy.deepcopy(input_ids)

    # Every prompt was normalized to DEFAULT_IMAGE_TOKEN above, so the masked
    # prefix length is identical for all samples: tokenize it once instead of
    # once per sample. Skipped entirely for an empty batch.
    if targets:
        prompt_len = len(tokenizer_image_token(DEFAULT_IMAGE_TOKEN, tokenizer))
        for target in targets:
            target[:prompt_len] = IGNORE_INDEX

    return dict(input_ids=input_ids, labels=targets)
608
609
610def preprocess(

Callers 1

preprocess · Function · 0.85

Calls 1

tokenizer_image_token · Function · 0.90

Tested by

no test coverage detected