MCPcopy Index your code
hub / github.com/ScrapeGraphAI/Scrapegraph-ai / _clean_urls

Method _clean_urls

scrapegraphai/nodes/parse_node.py:179–204  ·  view source on GitHub ↗

Cleans the URLs extracted from the text. Args: urls (List[str]): The list of URLs to clean. Returns: List[str]: The cleaned URLs.

(self, urls: List[str])

Source from the content-addressed store, hash-verified

177 return links, images
178
def _clean_urls(self, urls: List[str]) -> List[str]:
    """
    Cleans the URLs extracted from the text.

    A candidate rejected by ``ParseNode._is_valid_url`` has its leading
    text stripped up to and including each mixed bracket/parenthesis pair
    it contains (such as ``](``). Every candidate then loses any trailing
    ``)``, ``.`` or ``-`` characters, and candidates that end up empty are
    left out of the result.

    Args:
        urls (List[str]): The list of URLs to clean.

    Returns:
        List[str]: The cleaned URLs, in input order, without empty strings.
    """
    # One pattern for each of the eight ways to pair a square bracket with
    # a parenthesis. Each is applied in turn, in this order; the lazy
    # ``.*?`` prefix makes ``re.sub`` delete everything up to and including
    # the pair.
    junk_prefix_patterns = (
        r".*?\]\(",
        r".*?\[\(",
        r".*?\[\)",
        r".*?\]\)",
        r".*?\)\[",
        r".*?\(\[",  # previously a second copy of the ")[" pattern
        r".*?\(\]",
        r".*?\)\]",
    )

    cleaned_urls = []
    for url in urls:
        # Only candidates that fail validation get the prefix stripping;
        # already-valid URLs skip straight to the trailing-character trim.
        if not ParseNode._is_valid_url(url):
            for pattern in junk_prefix_patterns:
                url = re.sub(pattern, "", url)

        # Drop closing parentheses, dots and dashes left dangling at the end.
        url = url.rstrip(").-")
        if url:
            cleaned_urls.append(url)

    return cleaned_urls
205
206 @staticmethod
207 def _is_valid_url(url: str) -> bool:

Callers 1

_extract_urlsMethod · 0.95

Calls 1

_is_valid_urlMethod · 0.80

Tested by

no test coverage detected