hub / github.com/ScrapeGraphAI/Scrapegraph-ai / _extract_urls

Method _extract_urls

scrapegraphai/nodes/parse_node.py:131–177 · view source on GitHub ↗

Extracts URLs from the given text. Args: text (str): The text to extract URLs from. source (str): The origin of the text, used as the base for resolving relative URLs. Returns: Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.

(self, text: str, source: str)

Source from the content-addressed store, hash-verified

129	return state
130
def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
    """
    Extracts URLs from the given text.

    Matches of ``ParseNode.url_pattern`` are collected as they are; matches of
    ``ParseNode.relative_url_pattern`` are resolved against ``source`` with
    ``urljoin``. The de-duplicated URLs are passed through
    ``self._clean_urls`` and then split into links and images.

    Args:
        text (str): The text to extract URLs from.
        source (str): The origin of the text, used as the base for resolving
            relative URLs. When it does not start with "http", only URLs that
            themselves start with "http" are kept.

    Returns:
        Tuple[List[str], List[str]]: A tuple containing the extracted link URLs
        and image URLs. Both lists are empty when ``self.parse_urls`` is falsy.
    """
    if not self.parse_urls:
        return [], []

    image_extensions = tuple(default_filters.filter_dict["img_exts"])
    # Pieces of a relative-URL match that are pure bracket delimiters are
    # dropped before the URL is reassembled.
    delimiters = ("[", "]", "(", ")", "{", "}")

    # Dict keys de-duplicate like a set but keep first-seen order, so the
    # result no longer depends on string-hash randomisation.
    found = {}

    for group in ParseNode.url_pattern.findall(text):
        # Non-participating regex groups are "", which join() ignores.
        found["".join(group)] = None

    for group in ParseNode.relative_url_pattern.findall(text):
        relative = "".join(el for el in group if el not in delimiters)
        found[urljoin(source, relative)] = None

    all_urls = self._clean_urls(list(found))
    if not source.startswith("http"):
        # Without an http(s) base, relative URLs cannot be resolved to
        # something fetchable, so only absolute http(s) URLs are kept.
        all_urls = [url for url in all_urls if url.startswith("http")]
    else:
        all_urls = [urljoin(source, url) for url in all_urls]

    # Single pass: a URL is an image iff it ends with a known image
    # extension; everything else is a link.
    links = []
    images = []
    for url in all_urls:
        if url.endswith(image_extensions):
            images.append(url)
        else:
            links.append(url)

    return links, images
178
179	def _clean_urls(self, urls: List[str]) -> List[str]:
180	"""

Callers 1

executeMethod · 0.95

Calls 1

_clean_urlsMethod · 0.95

Tested by

no test coverage detected