Skip to content

Commit 8ffdbea

Browse files
committed
Refactor HTML sanitization in forms and views; remove sanitizer dependency and implement custom sanitizer
1 parent 83032be commit 8ffdbea

File tree

8 files changed

+347
-28
lines changed

8 files changed

+347
-28
lines changed

events/forms.py

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@
44
from django.conf import settings
55
from django_summernote.widgets import SummernoteInplaceWidget
66
from django.utils.translation import gettext_lazy as _
7-
8-
from sanitizer.forms import SanitizedCharField
9-
7+
from .html_sanitizer import sanitize_html
108
from .models import Event, EventParticipation
119
from .mixins import CrispyFormMixin, ReadOnlyFieldsMixin
1210

@@ -21,11 +19,11 @@
2119

2220
class EventForm(CrispyFormMixin):
2321

24-
description = SanitizedCharField(
25-
allowed_tags=settings.ALLOWED_HTML_TAGS_INPUT,
26-
allowed_attributes=settings.ALLOWED_HTML_ATTRIBUTES_INPUT,
27-
allowed_styles=settings.ALLOWED_HTML_STYLES_INPUT,
28-
strip=False, widget=SummernoteInplaceWidget())
22+
description = forms.CharField(
23+
widget=SummernoteInplaceWidget(),
24+
required=False,
25+
label=_("Descripción")
26+
)
2927

3028
start_at = forms.SplitDateTimeField(
3129
required=True,
@@ -72,15 +70,30 @@ class Meta:
7270
'has_sponsors': HAS_SPONSORS_HELP_TEXT,
7371
}
7472

73+
def clean_description(self):
74+
"""
75+
Clean and sanitize the 'description' field using the allowed tags/attrs/styles
76+
from Django settings.
77+
"""
78+
raw_description = self.cleaned_data.get('description', '')
79+
safe_html = sanitize_html(
80+
html=raw_description,
81+
allowed_tags=settings.ALLOWED_HTML_TAGS_INPUT,
82+
allowed_attrs=settings.ALLOWED_HTML_ATTRIBUTES_INPUT,
83+
allowed_styles=settings.ALLOWED_HTML_STYLES_INPUT
84+
)
85+
return safe_html
86+
7587
def clean(self):
7688
cleaned_data = super().clean()
7789
start_at = cleaned_data.get('start_at')
7890
end_at = cleaned_data.get('end_at')
79-
if start_at is not None and end_at is not None:
80-
if start_at > end_at:
81-
msg = 'La fecha de inicio es menor a la fecha de finalizacion'
82-
self._errors['start_at'] = [_(msg)]
83-
self._errors['end_at'] = [_(msg)]
91+
92+
if start_at and end_at and start_at > end_at:
93+
msg = 'La fecha de inicio es menor a la fecha de finalizacion'
94+
self._errors['start_at'] = [_(msg)]
95+
self._errors['end_at'] = [_(msg)]
96+
8497
return cleaned_data
8598

8699
def save(self, *args, **kwargs):

events/html_sanitizer.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
from bs4 import BeautifulSoup
2+
3+
from urllib.parse import urlparse
4+
5+
from django.conf import settings
6+
7+
8+
def is_safe_url(url: str, attr: str) -> bool:
9+
"""
10+
Check if the URL uses an allowed scheme for the given attribute.
11+
12+
Args:
13+
url (str): The URL to validate.
14+
attr (str): The attribute name (e.g., 'href', 'src').
15+
16+
Returns:
17+
bool: True if the URL is safe, False otherwise.
18+
"""
19+
parsed = urlparse(url)
20+
if parsed.scheme == "":
21+
# URLs without a scheme are considered safe (relative URLs)
22+
return True
23+
allowed_schemes = settings.ALLOWED_URL_SCHEMES.get(attr, [])
24+
return parsed.scheme.lower() in allowed_schemes
25+
26+
27+
def remove_disallowed_tags(soup: BeautifulSoup, allowed_tags: list[str]) -> None:
28+
"""
29+
Remove any tags that are not present in allowed_tags.
30+
"""
31+
for tag in soup.find_all():
32+
if tag.name not in allowed_tags:
33+
tag.decompose()
34+
35+
36+
def filter_style_attribute(style_value: str, allowed_styles: list[str]) -> str:
37+
"""
38+
Take a CSS style string (e.g. "color: red; font-weight: bold;") and
39+
return a sanitized version containing only the allowed properties.
40+
Returns an empty string if no allowed properties remain.
41+
"""
42+
filtered_style_pairs = []
43+
for prop_pair in style_value.split(";"):
44+
prop_pair = prop_pair.strip()
45+
if not prop_pair:
46+
continue
47+
if ":" not in prop_pair:
48+
continue
49+
50+
prop_name, prop_value = prop_pair.split(":", 1)
51+
prop_name = prop_name.strip().lower()
52+
prop_value = prop_value.strip()
53+
54+
if prop_name in allowed_styles:
55+
filtered_style_pairs.append(f"{prop_name}: {prop_value}")
56+
57+
return "; ".join(filtered_style_pairs)
58+
59+
60+
def filter_attributes(
61+
soup: BeautifulSoup, allowed_attrs: list[str], allowed_styles: list[str]
62+
) -> None:
63+
"""
64+
For each remaining (allowed) tag in the soup, remove any attribute
65+
not in allowed_attrs. If the attribute is 'style', filter out
66+
disallowed CSS properties.
67+
"""
68+
for tag in soup.find_all():
69+
for attr_name in list(tag.attrs):
70+
# If attribute name is not allowed, remove it
71+
if attr_name not in allowed_attrs:
72+
del tag.attrs[attr_name]
73+
# If it's a style attribute, apply style filtering
74+
elif attr_name == "style":
75+
style_value = tag.attrs["style"]
76+
safe_style = filter_style_attribute(style_value, allowed_styles)
77+
if safe_style:
78+
tag.attrs["style"] = safe_style
79+
else:
80+
# If no allowed properties remain, remove the style attribute
81+
del tag.attrs["style"]
82+
elif attr_name in settings.ALLOWED_URL_SCHEMES:
83+
# Validate URL schemes for attributes like 'href' and 'src'
84+
url = tag.attrs[attr_name]
85+
if not is_safe_url(url, attr_name):
86+
del tag.attrs[attr_name]
87+
88+
89+
def sanitize_html(
90+
html: str,
91+
allowed_tags: list[str],
92+
allowed_attrs: list[str],
93+
allowed_styles: list[str],
94+
) -> str:
95+
"""
96+
Main function that orchestrates the sanitization process.
97+
"""
98+
soup = BeautifulSoup(html, "html.parser")
99+
100+
# 1. Remove disallowed tags entirely
101+
remove_disallowed_tags(soup, allowed_tags)
102+
103+
# 2. Filter attributes (including style) on the remaining tags
104+
filter_attributes(soup, allowed_attrs, allowed_styles)
105+
106+
# Return the resulting sanitized HTML
107+
return str(soup)
Lines changed: 182 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,182 @@
1+
from django.test import TestCase, override_settings
2+
from events.html_sanitizer import sanitize_html
3+
from django.conf import settings
4+
5+
6+
class SanitizeHTMLTests(TestCase):
7+
def setUp(self):
8+
# Define allowed tags, attributes, and styles from settings
9+
self.allowed_tags = settings.ALLOWED_HTML_TAGS_INPUT
10+
self.allowed_attrs = settings.ALLOWED_HTML_ATTRIBUTES_INPUT
11+
self.allowed_styles = settings.ALLOWED_HTML_STYLES_INPUT
12+
13+
def test_allowed_tags_preserved(self):
14+
"""Ensure that allowed tags are preserved in the output."""
15+
input_html = "<p>This is a <strong>test</strong> paragraph.</p>"
16+
expected_output = "<p>This is a <strong>test</strong> paragraph.</p>"
17+
sanitized = sanitize_html(
18+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
19+
)
20+
self.assertEqual(sanitized, expected_output)
21+
22+
def test_disallowed_tags_removed(self):
23+
"""Ensure that disallowed tags are removed from the output."""
24+
input_html = "<p>This is a <script>alert('XSS');</script> test.</p>"
25+
expected_output = "<p>This is a test.</p>"
26+
sanitized = sanitize_html(
27+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
28+
)
29+
self.assertEqual(sanitized, expected_output)
30+
31+
def test_allowed_attributes_preserved(self):
32+
"""Ensure that allowed attributes are preserved."""
33+
input_html = '<a href="https://example.com" title="Example">Link</a>'
34+
expected_output = '<a href="https://example.com" title="Example">Link</a>'
35+
sanitized = sanitize_html(
36+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
37+
)
38+
self.assertEqual(sanitized, expected_output)
39+
40+
def test_disallowed_attributes_removed(self):
41+
"""Ensure that disallowed attributes are removed."""
42+
input_html = '<a href="https://example.com" onclick="alert(\'XSS\')">Link</a>'
43+
expected_output = '<a href="https://example.com">Link</a>'
44+
sanitized = sanitize_html(
45+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
46+
)
47+
self.assertEqual(sanitized, expected_output)
48+
49+
def test_allowed_styles_preserved(self):
50+
"""Ensure that allowed CSS styles are preserved."""
51+
input_html = '<p style="color: red; font-weight: bold;">Styled text</p>'
52+
expected_output = '<p style="color: red; font-weight: bold">Styled text</p>'
53+
sanitized = sanitize_html(
54+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
55+
)
56+
self.assertEqual(sanitized, expected_output)
57+
58+
def test_malformed_html(self):
59+
"""Ensure that malformed HTML is handled gracefully."""
60+
input_html = "<p>This is <b>bold<i> and italic</p>"
61+
expected_output = "<p>This is <b>bold<i> and italic</i></b></p>"
62+
sanitized = sanitize_html(
63+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
64+
)
65+
self.assertEqual(sanitized, expected_output)
66+
67+
def test_escape_entities(self):
68+
"""Ensure that HTML entities are preserved."""
69+
input_html = "5 &lt; 10 &amp; 10 &gt; 5"
70+
expected_output = "5 &lt; 10 &amp; 10 &gt; 5" # If using convert_charrefs=False
71+
sanitized = sanitize_html(
72+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
73+
)
74+
self.assertEqual(sanitized, expected_output)
75+
76+
def test_href_sanitization(self):
77+
"""Ensure that href attributes do not contain 'javascript:'."""
78+
input_html = "<a href=\"javascript:alert('XSS')\">Bad Link</a>"
79+
expected_output = "<a>Bad Link</a>" # 'href' removed due to unsafe scheme
80+
sanitized = sanitize_html(
81+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
82+
)
83+
self.assertEqual(sanitized, expected_output)
84+
85+
def test_style_with_malicious_content(self):
86+
"""Ensure that style attributes do not contain malicious content."""
87+
input_html = (
88+
"<p style=\"color: red; background-image: url('javascript:alert(1)');\">Test</p>"
89+
)
90+
expected_output = '<p style="color: red">Test</p>'
91+
92+
sanitized = sanitize_html(
93+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
94+
)
95+
self.assertEqual(sanitized, expected_output)
96+
97+
def test_completely_malicious_input(self):
98+
"""Ensure that completely malicious input is sanitized appropriately."""
99+
input_html = '<img src="x" onerror="alert(1)" /><script>alert("XSS")</script>'
100+
expected_output = '<img src="x"/>'
101+
102+
sanitized = sanitize_html(
103+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
104+
)
105+
self.assertEqual(sanitized, expected_output)
106+
107+
def test_preserve_allowed_and_remove_disallowed_mixed(self):
108+
"""Ensure that allowed and disallowed elements are correctly handled when mixed."""
109+
input_html = """
110+
<p>This is a <strong>strong</strong> and <em>emphasized</em> text.</p>
111+
<script>alert('XSS');</script>
112+
<a href="https://example.com" onclick="stealCookies()">Example Link</a>
113+
<div style="color: green; font-size: 12px;">Div content</div>
114+
"""
115+
expected_output = """
116+
<p>This is a <strong>strong</strong> and <em>emphasized</em> text.</p>
117+
118+
<a href="https://example.com">Example Link</a>
119+
120+
<div style="color: green; font-size: 12px">Div content</div>
121+
""".strip()
122+
123+
sanitized = sanitize_html(
124+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
125+
)
126+
127+
# Normalize spaces:
128+
sanitized = " ".join(sanitized.split())
129+
expected_output = " ".join(expected_output.split())
130+
131+
self.assertEqual(sanitized, expected_output)
132+
133+
def test_allow_links_with_safe_href(self):
134+
"""Ensure that links with safe href attributes are preserved."""
135+
input_html = '<a href="https://example.com" title="Example">Visit Example</a>'
136+
expected_output = (
137+
'<a href="https://example.com" title="Example">Visit Example</a>'
138+
)
139+
sanitized = sanitize_html(
140+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
141+
)
142+
self.assertEqual(sanitized, expected_output)
143+
144+
def test_allow_empty_style_attribute(self):
145+
"""Ensure that empty style attributes are removed."""
146+
input_html = '<p style="">No styles</p>'
147+
expected_output = "<p>No styles</p>"
148+
sanitized = sanitize_html(
149+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
150+
)
151+
self.assertEqual(sanitized, expected_output)
152+
153+
def test_preserve_text_only(self):
154+
"""Ensure that plain text without HTML remains unchanged."""
155+
input_html = "Just plain text without HTML."
156+
expected_output = "Just plain text without HTML."
157+
sanitized = sanitize_html(
158+
input_html, self.allowed_tags, self.allowed_attrs, self.allowed_styles
159+
)
160+
self.assertEqual(sanitized, expected_output)
161+
162+
@override_settings(
163+
ALLOWED_HTML_TAGS_INPUT=["p", "a"],
164+
ALLOWED_HTML_ATTRIBUTES_INPUT=["href"],
165+
ALLOWED_HTML_STYLES_INPUT=[],
166+
)
167+
def test_dynamic_allowed_tags(self):
168+
"""Ensure that sanitizer uses dynamically overridden settings."""
169+
input_html = (
170+
"<p>Paragraph with"
171+
'<a href="https://example.com" title="Example">link</a>'
172+
"and <b>bold</b>.</p>"
173+
)
174+
# With settings overridden, 'a' allows 'href' only, 'b' is disallowed
175+
expected = '<p>Paragraph with <a href="https://example.com">link</a> and .</p>'
176+
sanitized = sanitize_html(
177+
html=input_html,
178+
allowed_tags=settings.ALLOWED_HTML_TAGS_INPUT,
179+
allowed_attrs=settings.ALLOWED_HTML_ATTRIBUTES_INPUT,
180+
allowed_styles=settings.ALLOWED_HTML_STYLES_INPUT,
181+
)
182+
self.assertEqual(sanitized, expected)

events/tests/test_views.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
import bleach
2-
31
from django.test import TestCase, Client
42
from django.urls import reverse
53

@@ -89,7 +87,7 @@ def test_html_sanitizer_in_description_field(self):
8987
self.assertEqual(response.status_code, 302)
9088
self.assertEqual(Event.objects.filter(name='PyDay San Rafael').count(), 1)
9189
event = Event.objects.filter(name='PyDay San Rafael').get()
92-
self.assertEqual(event.description, bleach.clean('an <script>evil()</script> example'))
90+
self.assertEqual(event.description, ('an example'))
9391

9492
def test_events_view_delete(self):
9593
event = EventFactory(owner=self.user)

0 commit comments

Comments
 (0)