Load conversations from a JSONL file. Each line should be a JSON array of message objects with 'role' and 'content' fields. Example line: [{"role":"user","content":"Hi"},{"role":"assistant","content":"Hello"}]
| 8 | from tasks.common import Task |
| 9 | |
class CustomJSON(Task):
    """
    Load conversations from a JSONL file.

    Each non-empty line should be a JSON array of message objects with
    'role' and 'content' fields. A conversation needs at least 2 messages,
    roles must alternate "user", "assistant", "user", ... starting with
    "user", and every 'content' must be a string.
    Example line: [{"role":"user","content":"Hi"},{"role":"assistant","content":"Hello"}]
    """

    def __init__(self, filepath, **kwargs):
        """
        Read and validate every conversation stored in `filepath`.

        Args:
            filepath: path of the JSONL file to load.
            **kwargs: forwarded unchanged to the base class constructor.

        Raises:
            AssertionError: if a line does not hold a well-formed conversation.
            json.JSONDecodeError: if a non-empty line is not valid JSON.

        A missing file is not an error: a hint is printed and the task ends
        up with zero conversations.
        """
        super().__init__(**kwargs)
        self.filepath = filepath
        self.conversations = []

        # Load all conversations from the JSONL file
        if not os.path.exists(filepath):
            # Helpful error message due to recent change. Will be removed in the future.
            print("-" * 80)
            print(f"Warning: File {filepath} does not exist")
            print("HINT (Oct 21 2025)")
            print("If you recently did a git pull and suddenly see this, it might be due to the new addition of identity conversations")
            print("See this discussion for more details: https://github.com/karpathy/nanochat/discussions/139")
            print("Quick fix: simply run the following command to download the file and you're done:")
            print(f"curl -L -o {filepath} https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl")
            print("-" * 80)
        else:
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:  # skip empty lines
                        continue
                    messages = json.loads(line)
                    self._validate_messages(messages)
                    self.conversations.append(messages)

        self.length = len(self.conversations)

    @staticmethod
    def _validate_messages(messages):
        """
        Check that `messages` is a well-formed conversation.

        The checks raise AssertionError explicitly instead of using `assert`
        statements: `assert` is removed when Python runs with -O, which would
        let malformed data through, while raising AssertionError keeps the
        exception type that the assert-based checks produced.
        """
        if not isinstance(messages, list):
            raise AssertionError(f"Expected list of messages, got {type(messages)}")
        if len(messages) < 2:
            raise AssertionError(f"Conversation must have at least 2 messages, got {len(messages)}")
        # Validate message structure and alternating roles
        for i, message in enumerate(messages):
            # Without this check a string element would turn the `in` tests
            # below into substring searches, and a number would raise TypeError.
            if not isinstance(message, dict):
                raise AssertionError(f"Message {i} must be a JSON object, got {type(message)}")
            if "role" not in message:
                raise AssertionError(f"Message {i} missing 'role' field")
            if "content" not in message:
                raise AssertionError(f"Message {i} missing 'content' field")
            expected_role = "user" if i % 2 == 0 else "assistant"
            if message["role"] != expected_role:
                raise AssertionError(f"Message {i} has role {message['role']} but should be {expected_role}")
            if not isinstance(message["content"], str):
                raise AssertionError(f"Message {i} content must be a string")

    def num_examples(self):
        """Return the number of conversations that were loaded."""
        return self.length

    def get_example(self, index):
        """Return the conversation at `index` wrapped as {"messages": [...]}."""
        messages = self.conversations[index]
        conversation = {
            "messages": messages,
        }
        return conversation
| 65 |