2022-11-12 06:02:43 +01:00
|
|
|
import bleach
|
2022-11-14 03:03:43 +01:00
|
|
|
from bleach.linkifier import LinkifyFilter
|
2022-11-12 06:02:43 +01:00
|
|
|
from django.utils.safestring import mark_safe
|
|
|
|
|
|
|
|
|
2022-11-14 03:03:43 +01:00
|
|
|
def allow_a(tag: str, name: str, value: str):
    """
    Attribute filter for anchor tags, written as a (tag, name, value) callable.

    Returns True for href, title and class attributes. A rel attribute is
    accepted only when every whitespace-separated token in its value is one
    of nofollow, noopener, noreferrer or tag. Everything else returns False.
    """
    if name == "rel":
        # Only a small subset of rel values is acceptable
        # (we're defending against, for example, rel=me)
        permitted = ("nofollow", "noopener", "noreferrer", "tag")
        return not any(token not in permitted for token in value.split())
    return name in ("href", "title", "class")
|
|
|
|
|
|
|
|
|
2022-11-12 06:02:43 +01:00
|
|
|
def sanitize_post(post_html: str) -> str:
    """
    Runs the post HTML through a bleach Cleaner and marks the result safe.

    The cleaner's tag list contains only br and p, with strip=True and the
    LinkifyFilter applied. Attribute rules are supplied for a (via allow_a),
    p and span, although a and span are not in the tag list themselves.
    """
    attribute_rules = {
        "a": allow_a,
        "p": ["class"],
        "span": ["class"],
    }
    cleaner = bleach.Cleaner(
        tags=["br", "p"],
        attributes=attribute_rules,  # type:ignore
        filters=[LinkifyFilter],
        strip=True,
    )
    cleaned = cleaner.clean(post_html)
    return mark_safe(cleaned)
|
2022-11-22 05:18:13 +01:00
|
|
|
|
|
|
|
|
|
|
|
def strip_html(post_html: str) -> str:
    """
    Cleans the HTML with an empty tag list and strip=True, applies the
    LinkifyFilter, and returns the result marked as safe.
    """
    linkifying_stripper = bleach.Cleaner(
        tags=[],
        strip=True,
        filters=[LinkifyFilter],
    )
    stripped = linkifying_stripper.clean(post_html)
    return mark_safe(stripped)
|
2022-11-27 20:09:08 +01:00
|
|
|
|
|
|
|
|
|
|
|
def html_to_plaintext(post_html: str) -> str:
    """
    Tries to do the inverse of the linebreaks filter.

    Existing newlines are dropped, line-break tags and closing paragraph tags
    are turned into newlines, all remaining markup is removed with bleach,
    and the result is returned with surrounding whitespace stripped.
    """
    # TODO: Handle HTML entities
    # Remove all newlines so only the markup decides where lines break
    post_html = post_html.replace("\n", "")
    # Replace br with a newline. The self-closing spellings are handled too;
    # otherwise the cleaner below would strip them without leaving a line
    # break and adjacent lines would run together.
    for br_variant in ("<br>", "<br/>", "<br />"):
        post_html = post_html.replace(br_variant, "\n")
    # Replace /p with a newline (the original note says a second one
    # comes from bleach - NOTE(review): confirm)
    post_html = post_html.replace("</p>", "\n")
    # Remove all other HTML and return
    cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])
    return cleaner.clean(post_html).strip()
|