Metadata for a document.
| 41 | |
| 42 | |
| 43 | class DocMetaData(BaseModel): |
| 44 | """Metadata for a document.""" |
| 45 | |
| 46 | source: str = "context" # just reference |
| 47 | source_content: str = "context" # reference and content |
| 48 | title: str = "Unknown Title" |
| 49 | published_date: str = "Unknown Date" |
| 50 | is_chunk: bool = False # if it is a chunk, don't split |
| 51 | id: str = Field(default_factory=lambda: str(uuid4())) |
| 52 | window_ids: List[str] = [] # for RAG: ids of chunks around this one |
| 53 | |
@field_validator("id", mode="before")
@classmethod
def convert_id_to_string(cls, v: Any) -> str:
    """Normalize the raw ``id`` value to a string before field validation.

    Args:
        v: The incoming value supplied for ``id``; may be of any type.

    Returns:
        A freshly generated UUID4 string when ``v`` is ``None``,
        otherwise ``str(v)``.
    """
    # A missing id gets a new random identifier rather than the text "None".
    return str(uuid4()) if v is None else str(v)
| 61 | |
def dict_bool_int(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
    """Dump the model to a dict with boolean values replaced by 0/1.

    Some downstream libraries (e.g. Chroma) complain about bool fields in
    metadata, so every top-level ``bool`` value is emitted as an ``int``.

    Args:
        *args: Positional arguments forwarded to ``model_dump``.
        **kwargs: Keyword arguments forwarded to ``model_dump``.

    Returns:
        The dumped dictionary; top-level ``True``/``False`` values become
        ``1``/``0`` and all other values are passed through untouched.
    """
    dumped = super().model_dump(*args, **kwargs)
    # Only top-level values are inspected; nested containers are left as-is.
    return {
        name: int(val) if isinstance(val, bool) else val
        for name, val in dumped.items()
    }
| 75 | |
| 76 | def __str__(self) -> str: |
| 77 | title_str = ( |
| 78 | "" |
| 79 | if "unknown" in self.title.lower() or self.title.strip() == "" |
| 80 | else f"Title: {self.title}" |
| 81 | ) |
| 82 | date_str = "" |
| 83 | if ( |
| 84 | "unknown" not in self.published_date.lower() |
| 85 | and self.published_date.strip() != "" |
| 86 | ): |
| 87 | try: |
| 88 | from dateutil import parser |
| 89 | |
| 90 | # Try to parse the date string |
| 91 | date_obj = parser.parse(self.published_date) |
| 92 | # Format to include only the date part (year-month-day) |
| 93 | date_only = date_obj.strftime("%Y-%m-%d") |
| 94 | date_str = f"Date: {date_only}" |
| 95 | except (ValueError, ImportError, TypeError): |
| 96 | # If parsing fails, just use the original date |
| 97 | date_str = f"Date: {self.published_date}" |
| 98 | components = [self.source] + ( |
| 99 | [] if title_str + date_str == "" else [title_str, date_str] |
| 100 | ) |
no outgoing calls
searching dependent graphs…