Extract functions, classes, and includes from a .cpp/.cc/.cxx/.hpp file.
(path: Path)
| 1074 | |
| 1075 | |
| 1076 | def extract_cpp(path: Path) -> dict: |
| 1077 | """Extract functions, classes, and includes from a .cpp/.cc/.cxx/.hpp file.""" |
| 1078 | try: |
| 1079 | import tree_sitter_cpp as tscpp |
| 1080 | from tree_sitter import Language, Parser |
| 1081 | except ImportError: |
| 1082 | return {"nodes": [], "edges": [], "error": "tree-sitter-cpp not installed"} |
| 1083 | |
| 1084 | try: |
| 1085 | language = Language(tscpp.language()) |
| 1086 | parser = Parser(language) |
| 1087 | source = path.read_bytes() |
| 1088 | tree = parser.parse(source) |
| 1089 | root = tree.root_node |
| 1090 | except Exception as e: |
| 1091 | return {"nodes": [], "edges": [], "error": str(e)} |
| 1092 | |
| 1093 | stem = path.stem |
| 1094 | str_path = str(path) |
| 1095 | nodes: list[dict] = [] |
| 1096 | edges: list[dict] = [] |
| 1097 | seen_ids: set[str] = set() |
| 1098 | |
def add_node(nid: str, label: str, line: int) -> None:
    """Record a code node for this file, ignoring ids that were already added.

    Args:
        nid: Identifier of the node; also the de-duplication key.
        label: Display label stored on the node.
        line: Line number, stored as the string ``L<line>``.
    """
    # Guard clause: a repeated id leaves the first-registered node untouched.
    if nid in seen_ids:
        return
    seen_ids.add(nid)
    record = {
        "id": nid,
        "label": label,
        "file_type": "code",
        "source_file": str_path,
        "source_location": f"L{line}",
    }
    nodes.append(record)
| 1109 | |
def add_edge_raw(
    src: str,
    tgt: str,
    relation: str,
    line: int,
    confidence: str = "EXTRACTED",
    weight: float = 1.0,
) -> None:
    """Append one edge record to the collected edges (no de-duplication).

    Args:
        src: Id stored under ``source``.
        tgt: Id stored under ``target``.
        relation: Relation name stored on the edge.
        line: Line number, stored as the string ``L<line>``.
        confidence: Confidence tag stored on the edge.
        weight: Numeric weight stored on the edge.
    """
    location = f"L{line}"
    edge = {
        "source": src,
        "target": tgt,
        "relation": relation,
        "confidence": confidence,
        "source_file": str_path,
        "source_location": location,
        "weight": weight,
    }
    edges.append(edge)
| 1120 | |
| 1121 | file_nid = _make_id(stem) |
| 1122 | add_node(file_nid, path.name, 1) |
| 1123 | |
| 1124 | function_bodies: list[tuple[str, object]] = [] |
| 1125 | |
| 1126 | def _get_func_name_from_declarator(node) -> str | None: |
| 1127 | """Recursively unwrap declarator to find the innermost identifier.""" |
| 1128 | if node.type == "identifier": |
| 1129 | return source[node.start_byte:node.end_byte].decode("utf-8", errors="replace") |
| 1130 | if node.type == "qualified_identifier": |
| 1131 | name_node = node.child_by_field_name("name") |
| 1132 | if name_node: |
| 1133 | return source[name_node.start_byte:name_node.end_byte].decode("utf-8", errors="replace") |