| 337 | |
| 338 | # Copied from diffusers.pipelines.deepfloyd_if.pipeline_if.IFPipeline._clean_caption |
| 339 | def clean_caption(caption): |
| 340 | caption = str(caption) |
| 341 | caption = ul.unquote_plus(caption) |
| 342 | caption = caption.strip().lower() |
| 343 | caption = re.sub("<person>", "person", caption) |
| 344 | # urls: |
| 345 | caption = re.sub( |
| 346 | r"\b((?:https?:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa |
| 347 | "", |
| 348 | caption, |
| 349 | ) # regex for urls |
| 350 | caption = re.sub( |
| 351 | r"\b((?:www:(?:\/{1,3}|[a-zA-Z0-9%])|[a-zA-Z0-9.\-]+[.](?:com|co|ru|net|org|edu|gov|it)[\w/-]*\b\/?(?!@)))", # noqa |
| 352 | "", |
| 353 | caption, |
| 354 | ) # regex for urls |
| 355 | # html: |
| 356 | caption = BeautifulSoup(caption, features="html.parser").text |
| 357 | |
| 358 | # @<nickname> |
| 359 | caption = re.sub(r"@[\w\d]+\b", "", caption) |
| 360 | |
| 361 | # 31C0—31EF CJK Strokes |
| 362 | # 31F0—31FF Katakana Phonetic Extensions |
| 363 | # 3200—32FF Enclosed CJK Letters and Months |
| 364 | # 3300—33FF CJK Compatibility |
| 365 | # 3400—4DBF CJK Unified Ideographs Extension A |
| 366 | # 4DC0—4DFF Yijing Hexagram Symbols |
| 367 | # 4E00—9FFF CJK Unified Ideographs |
| 368 | caption = re.sub(r"[\u31c0-\u31ef]+", "", caption) |
| 369 | caption = re.sub(r"[\u31f0-\u31ff]+", "", caption) |
| 370 | caption = re.sub(r"[\u3200-\u32ff]+", "", caption) |
| 371 | caption = re.sub(r"[\u3300-\u33ff]+", "", caption) |
| 372 | caption = re.sub(r"[\u3400-\u4dbf]+", "", caption) |
| 373 | caption = re.sub(r"[\u4dc0-\u4dff]+", "", caption) |
| 374 | caption = re.sub(r"[\u4e00-\u9fff]+", "", caption) |
| 375 | ####################################################### |
| 376 | |
| 377 | # все виды тире / all types of dash --> "-" |
| 378 | caption = re.sub( |
| 379 | r"[\u002D\u058A\u05BE\u1400\u1806\u2010-\u2015\u2E17\u2E1A\u2E3A\u2E3B\u2E40\u301C\u3030\u30A0\uFE31\uFE32\uFE58\uFE63\uFF0D]+", # noqa |
| 380 | "-", |
| 381 | caption, |
| 382 | ) |
| 383 | |
| 384 | # normalize quotation marks to one standard |
| 385 | caption = re.sub(r"[`´«»“”¨]", '"', caption) |
| 386 | caption = re.sub(r"[‘’]", "'", caption) |
| 387 | |
| 388 | # " |
| 389 | caption = re.sub(r""?", "", caption) |
| 390 | # & |
| 391 | caption = re.sub(r"&", "", caption) |
| 392 | |
| 393 | # ip addresses: |
| 394 | caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption) |
| 395 | |
| 396 | # article ids: |