Method _clean_urls

scrapegraphai/nodes/parse_node.py:179–204 · view source on GitHub ↗

Cleans the URLs extracted from the text. Args: urls (List[str]): The list of URLs to clean. Returns: List[str]: The cleaned URLs.

(self, urls: List[str])

Source from the content-addressed store, hash-verified

177	return links, images
178
def _clean_urls(self, urls: List[str]) -> List[str]:
    """
    Strip leftover markdown link syntax from extracted URLs.

    Entries rejected by ``ParseNode._is_valid_url`` are run through a fixed
    sequence of regex substitutions; each one deletes any text up to and
    including an occurrence of a two-character bracket/parenthesis pair
    (for example ``](``). Every entry, valid or not, then has trailing
    ``)``, ``.`` and ``-`` characters removed, and entries that end up
    empty are dropped.

    Args:
        urls (List[str]): The list of URLs to clean.

    Returns:
        List[str]: The cleaned, non-empty URLs, in their original order.
    """
    # NOTE(review): the nesting below was reconstructed from a listing that
    # had lost its indentation -- confirm that the trailing-character strip
    # and the append are meant to run for every URL, not only invalid ones.

    # Applied strictly in this order. The ")[" pattern is listed twice in
    # the original and is kept that way here.
    bracket_prefix_patterns = (
        r".*?\]\(",
        r".*?\[\(",
        r".*?\[\)",
        r".*?\]\)",
        r".*?\)\[",
        r".*?\)\[",
        r".*?\(\]",
        r".*?\)\]",
    )

    kept = []
    for candidate in urls:
        if not ParseNode._is_valid_url(candidate):
            for pattern in bracket_prefix_patterns:
                candidate = re.sub(pattern, "", candidate)

        # Closing parens, dots and dashes at the end are treated as
        # surrounding punctuation rather than part of the URL.
        candidate = candidate.rstrip(").-")
        if candidate:
            kept.append(candidate)

    return kept
205
206	@staticmethod
207	def _is_valid_url(url: str) -> bool:

_extract_urlsMethod · 0.95

_is_valid_urlMethod · 0.80

no test coverage detected