MCPcopy Index your code
hub / github.com/ScrapeGraphAI/Scrapegraph-ai / _extract_urls

Method _extract_urls

scrapegraphai/nodes/parse_node.py:131–177  ·  view source on GitHub ↗

Extracts URLs from the given text. Args: text (str): The text to extract URLs from. source (str): The origin of the text, used as the base when resolving relative URLs. Returns: Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.

(self, text: str, source: str)

Source from the content-addressed store, hash-verified

129 return state
130
def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
    """
    Extracts URLs from the given text.

    Matches of ``ParseNode.url_pattern`` are collected as-is, matches of
    ``ParseNode.relative_url_pattern`` are resolved against ``source`` with
    ``urljoin``. The de-duplicated candidates are passed through
    ``self._clean_urls`` and finally split by file extension into image
    URLs and all other links.

    Args:
        text (str): The text to extract URLs from.
        source (str): The origin of the text. Used as the base for
            ``urljoin``; when it does not start with "http", only
            candidates that themselves start with "http" are kept.

    Returns:
        Tuple[List[str], List[str]]: A tuple containing the extracted link
        URLs and image URLs. Both lists are empty when ``self.parse_urls``
        is falsy. URLs appear in first-seen order.
    """
    if not self.parse_urls:
        return [], []

    # str.endswith accepts a tuple of suffixes, so build it once up front.
    image_extensions = tuple(default_filters.filter_dict["img_exts"])
    # Pieces of a relative match that are dropped before re-assembling it.
    skipped_tokens = ("", "[", "]", "(", ")", "{", "}")

    # A dict is used as an insertion-ordered set: it de-duplicates like the
    # set it replaces but keeps the result order reproducible across runs.
    found = {}

    for group in ParseNode.url_pattern.findall(text):
        # Empty captures contribute nothing, so a plain join is sufficient.
        found["".join(group)] = None

    for group in ParseNode.relative_url_pattern.findall(text):
        relative = "".join(el for el in group if el not in skipped_tokens)
        found[urljoin(source, relative)] = None

    candidates = self._clean_urls(list(found))
    if source.startswith("http"):
        candidates = [urljoin(source, url) for url in candidates]
    else:
        # Without an http(s) base nothing can be resolved; keep absolute URLs only.
        candidates = [url for url in candidates if url.startswith("http")]

    # Single pass instead of building `images` and then scanning it per URL.
    links: List[str] = []
    images: List[str] = []
    for url in candidates:
        if url.endswith(image_extensions):
            images.append(url)
        else:
            links.append(url)

    return links, images
178
179 def _clean_urls(self, urls: List[str]) -> List[str]:
180 """

Callers 1

executeMethod · 0.95

Calls 1

_clean_urlsMethod · 0.95

Tested by

no test coverage detected