MCPcopy
hub / github.com/langroid/langroid / DocMetaData

Class DocMetaData

langroid/mytypes.py:43–103  ·  view source on GitHub ↗

Metadata for a document.

Source from the content-addressed store, hash-verified

41
42
43class DocMetaData(BaseModel):
44 """Metadata for a document."""
45
46 source: str = "context" # just reference
47 source_content: str = "context" # reference and content
48 title: str = "Unknown Title"
49 published_date: str = "Unknown Date"
50 is_chunk: bool = False # if it is a chunk, don't split
51 id: str = Field(default_factory=lambda: str(uuid4()))
52 window_ids: List[str] = [] # for RAG: ids of chunks around this one
53
@field_validator("id", mode="before")
@classmethod
def convert_id_to_string(cls, v: Any) -> str:
    """Coerce an incoming ``id`` value to ``str`` (runs in "before" mode).

    Args:
        v: Raw value supplied for the ``id`` field; may be of any type.

    Returns:
        A freshly generated UUID4 string when ``v`` is ``None``,
        otherwise ``str(v)``.
    """
    return str(uuid4()) if v is None else str(v)
61
def dict_bool_int(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
    """
    Dump the model to a dict, replacing every top-level bool value with the
    equivalent int (False -> 0, True -> 1).

    This appeases some downstream libraries, e.g. Chroma, which complains
    about bool fields in metadata.

    Args:
        *args: Positional arguments forwarded unchanged to ``model_dump``.
        **kwargs: Keyword arguments forwarded unchanged to ``model_dump``.

    Returns:
        The dumped dict, with bool values converted to int. Only top-level
        values are inspected; nested containers are left as dumped.
    """
    dumped = super().model_dump(*args, **kwargs)
    return {
        name: int(val) if isinstance(val, bool) else val
        for name, val in dumped.items()
    }
75
76 def __str__(self) -> str:
77 title_str = (
78 ""
79 if "unknown" in self.title.lower() or self.title.strip() == ""
80 else f"Title: {self.title}"
81 )
82 date_str = ""
83 if (
84 "unknown" not in self.published_date.lower()
85 and self.published_date.strip() != ""
86 ):
87 try:
88 from dateutil import parser
89
90 # Try to parse the date string
91 date_obj = parser.parse(self.published_date)
92 # Format to include only the date part (year-month-day)
93 date_only = date_obj.strftime("%Y-%m-%d")
94 date_str = f"Date: {date_only}"
95 except (ValueError, ImportError, TypeError):
96 # If parsing fails, just use the original date
97 date_str = f"Date: {self.published_date}"
98 components = [self.source] + (
99 [] if title_str + date_str == "" else [title_str, date_str]
100 )

Callers 15

get_doc — Method · 0.90
get_doc_chunks — Method · 0.90

Calls

no outgoing calls

Used in the wild: real call sites across dependent graphs

searching dependent graphs…