MCPcopy
hub / github.com/Vchitect/Latte / clean_caption

Function clean_caption

utils.py:339–451  ·  view source on GitHub ↗
(caption)

Source from the content-addressed store, hash-verified

337
338# Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption
339def clean_caption(caption):
340 caption = str(caption)
341 caption = ul.unquote_plus(caption)
342 caption = caption.strip().lower()
343 caption = re.sub("<person>", "person", caption)
344 # urls:
345 caption = re.sub(
346 r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
347 "",
348 caption,
349 ) # regex for urls
350 caption = re.sub(
351 r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa
352 "",
353 caption,
354 ) # regex for urls
355 # html:
356 caption = BeautifulSoup(caption, features="html.parser").text
357
358 # @<nickname>
359 caption = re.sub(r"@[\w\d]+\b", "", caption)
360
361 # 31C0—31EF CJK Strokes
362 # 31F0—31FF Katakana Phonetic Extensions
363 # 3200—32FF Enclosed CJK Letters and Months
364 # 3300—33FF CJK Compatibility
365 # 3400—4DBF CJK Unified Ideographs Extension A
366 # 4DC0—4DFF Yijing Hexagram Symbols
367 # 4E00—9FFF CJK Unified Ideographs
368 caption = re.sub(r"[\u31c0-\u31ef]+", "", caption)
369 caption = re.sub(r"[\u31f0-\u31ff]+", "", caption)
370 caption = re.sub(r"[\u3200-\u32ff]+", "", caption)
371 caption = re.sub(r"[\u3300-\u33ff]+", "", caption)
372 caption = re.sub(r"[\u3400-\u4dbf]+", "", caption)
373 caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption)
374 caption = re.sub(r"[\u4e00-\u9fff]+", "", caption)
375 #######################################################
376
377 # все виды тире / all types of dash --> "-"
378 caption = re.sub(
379 r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa
380 "-",
381 caption,
382 )
383
384 # normalize quotes to a single standard
385 caption = re.sub(r"[`´«»“”¨]", '"', caption)
386 caption = re.sub(r"[‘’]", "'", caption)
387
388 # &quot;
389 caption = re.sub(r"&quot;?", "", caption)
390 # &amp
391 caption = re.sub(r"&amp", "", caption)
392
393 # ip addresses:
394 caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
395
396 # article ids:

Callers 1

processFunction · 0.85

Calls 1

subMethod · 0.80

Tested by

no test coverage detected