diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
index 40ce721a..2ca433a4 100644
--- a/.github/workflows/python.yaml
+++ b/.github/workflows/python.yaml
@@ -15,9 +15,9 @@ jobs:
- macos-latest
- windows-latest
steps:
- - uses: actions/checkout@v4
+ - uses: actions/checkout@v6
- name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v5
+ uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
diff --git a/lib/markdown2.py b/lib/markdown2.py
index dc698970..6518d783 100755
--- a/lib/markdown2.py
+++ b/lib/markdown2.py
@@ -862,6 +862,10 @@ def _detab(self, text: str) -> str:
output.append(self._detab_line(line))
return '\n'.join(output)
+ # https://developer.mozilla.org/en-US/docs/Glossary/Void_element
+ # technically "self closing tags" (eg: <br />) are not real HTML but no one cares
+ _void_tags = 'area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'
+
# I broke out the html5 tags here and add them to _block_tags_a and
# _block_tags_b. This way html5 tags are easy to keep track of.
_html5tags = '|address|article|aside|canvas|figcaption|figure|footer|header|main|nav|section|video'
@@ -906,6 +910,7 @@ def _detab(self, text: str) -> str:
_html_markdown_attr_re = re.compile(
# markdown attr, with optional assignment to true, must be followed by whitespace/boundary/closing tag chars
r'''\s+markdown(?:="1"|='1'|=1)?(?![^\s/>\b])''')
+
def _hash_html_block_sub(
self,
match: Union[re.Match[str], str],
@@ -1104,16 +1109,17 @@ def _strict_tag_block_sub(
block += chunk
if is_markup:
- if chunk.startswith('%s' % is_markup.group(1)):
- tag_count -= 1
+ if self._tag_is_closed(is_markup.group(3), chunk):
+ # if close tag is in same line we must ignore these
+ is_markup = None
else:
- # if close tag is in same line
- if self._tag_is_closed(is_markup.group(3), chunk):
- # we must ignore these
- is_markup = None
- else:
- tag_count += 1
- current_tag = is_markup.group(3)
+ # add up all the open/close tags possibly in the same line and add that to the total
+ current_tag = is_markup.group(3)
+ tag_count += self._tag_imbalance(current_tag, chunk)
+ elif current_tag != html_tags_re and current_tag in chunk:
+ # if we're looking for a specific tag then check for any opens/closes later on in the
+ # line that may throw off our count
+ tag_count += self._tag_imbalance(current_tag, chunk)
if tag_count == 0:
if is_markup:
@@ -1127,6 +1133,9 @@ def _strict_tag_block_sub(
return result
def _tag_is_closed(self, tag_name: str, text: str) -> bool:
+ if re.match(self._void_tags, tag_name):
+ return True
+
# check if number of open tags == number of close tags
if len(re.findall('<%s(?:.*?)>' % tag_name, text)) != text.count('%s>' % tag_name):
return False
@@ -1136,6 +1145,29 @@ def _tag_is_closed(self, tag_name: str, text: str) -> bool:
open_index = text.find(f'<{tag_name}')
return open_index != -1 and close_index != -1 and open_index < close_index
+    def _tag_imbalance(self, tag_name: str, text: str) -> int:
+        '''
+        Count how far the open/close tags for `tag_name` are out of balance in some text
+
+        Args:
+            tag_name: the name of the tag (eg: "ul")
+            text: the text to search
+
+        Returns:
+            0 for balanced tags (always 0 for void tags, which never take a closing tag),
+            positive int for more opening than closing tags, negative int for the reverse
+        '''
+        if re.fullmatch(self._void_tags, tag_name):  # whole name, so "colgroup" is not read as "col"
+            return 0
+
+        count = 0
+        for tag in re.finditer(r'<(/)?%s\b>?' % tag_name, text):
+            if tag.group(1):
+                count -= 1
+            else:
+                count += 1
+        return count
+
@mark_stage(Stage.LINK_DEFS)
def _strip_link_definitions(self, text: str) -> str:
# Strips link definitions from text, stores the URLs and titles in
@@ -1421,13 +1453,13 @@ def _unhash_html_spans(self, text: str, spans=True, code=False) -> str:
'''
orig = ''
while text != orig:
+ orig = text
if spans:
for key, sanitized in list(self.html_spans.items()):
text = text.replace(key, sanitized)
if code:
for code, key in list(self._code_table.items()):
text = text.replace(key, code)
- orig = text
return text
def _sanitize_html(self, s: str) -> str:
@@ -1518,6 +1550,12 @@ def _protect_url(self, url: str) -> str:
mime = data_url.group('mime') or ''
if mime.startswith('image/') and data_url.group('token') == ';base64':
charset='base64'
+ else:
+ url = (
+ self._unhash_html_spans(url, code=True)
+ .replace('*', self._escape_table['*'])
+ .replace('_', self._escape_table['_'])
+ )
url = _html_escape_url(url, safe_mode=self.safe_mode, charset=charset)
key = _hash_text(url)
self._escape_table[url] = key
@@ -1537,8 +1575,10 @@ def _safe_href(self):
safe = r'-\w'
# omitted ['"<>] for XSS reasons
less_safe = r'#/\.!#$%&\(\)\+,/:;=\?@\[\]^`\{\}\|~'
+ # HTML-encoded colons in a URL still function as normal colons, so we need to detect those too
+ protocol_seperators = [':', ':', ':', ':']
# dot seperated hostname, optional port number, not followed by protocol seperator
- domain = r'(?:[{}]+(?:\.[{}]+)*)(?:(?
+
+
+
+) <script>alert(origin)</script>
+
+"
diff --git a/test/tm-cases/improper_void_tag_hashing_pr705.opts b/test/tm-cases/improper_void_tag_hashing_pr705.opts
new file mode 100644
index 00000000..ad487c04
--- /dev/null
+++ b/test/tm-cases/improper_void_tag_hashing_pr705.opts
@@ -0,0 +1 @@
+{"safe_mode": "escape"}
diff --git a/test/tm-cases/improper_void_tag_hashing_pr705.text b/test/tm-cases/improper_void_tag_hashing_pr705.text
new file mode 100644
index 00000000..223862cf
--- /dev/null
+++ b/test/tm-cases/improper_void_tag_hashing_pr705.text
@@ -0,0 +1,10 @@
+---
+* ```
+ * ```
+
+ x
+```
+---
+```) ```
+"
+---
diff --git a/test/tm-cases/malformed_html_crash_issue584.html b/test/tm-cases/malformed_html_crash_issue584.html
index e2071f84..00f32cdb 100644
--- a/test/tm-cases/malformed_html_crash_issue584.html
+++ b/test/tm-cases/malformed_html_crash_issue584.html
@@ -1,3 +1,3 @@
-
-
+
diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.html b/test/tm-cases/xss_from_incorrect_block_hashing.html
new file mode 100644
index 00000000..db2fe827
--- /dev/null
+++ b/test/tm-cases/xss_from_incorrect_block_hashing.html
@@ -0,0 +1,14 @@
+
+
+
+
+[x](")}<img src="x" onerror="alert(origin)">
+
+
diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.opts b/test/tm-cases/xss_from_incorrect_block_hashing.opts
new file mode 100644
index 00000000..54de31a8
--- /dev/null
+++ b/test/tm-cases/xss_from_incorrect_block_hashing.opts
@@ -0,0 +1 @@
+{"safe_mode": "escape"}
\ No newline at end of file
diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.text b/test/tm-cases/xss_from_incorrect_block_hashing.text
new file mode 100644
index 00000000..d0770a4a
--- /dev/null
+++ b/test/tm-cases/xss_from_incorrect_block_hashing.text
@@ -0,0 +1,5 @@
+- [x]
+ 1. - [x]
+___
+[x](`")}
+___
diff --git a/test/tm-cases/xss_smuggling_spans_in_image_attrs.html b/test/tm-cases/xss_smuggling_spans_in_image_attrs.html
index 37cd276e..ed1ca655 100644
--- a/test/tm-cases/xss_smuggling_spans_in_image_attrs.html
+++ b/test/tm-cases/xss_smuggling_spans_in_image_attrs.html
@@ -3,3 +3,21 @@
![<code>" onerror="alert(1)//</code>]()

+
+x
+
+x
+
+
+-
+
+
onerror=alert(origin) )
+
+
+
+"></code)
+
+http://onclick=alert(origin)//
+
+">`)
diff --git a/test/tm-cases/xss_smuggling_spans_in_image_attrs.text b/test/tm-cases/xss_smuggling_spans_in_image_attrs.text
index 4a5c25a8..bee50136 100644
--- a/test/tm-cases/xss_smuggling_spans_in_image_attrs.text
+++ b/test/tm-cases/xss_smuggling_spans_in_image_attrs.text
@@ -2,4 +2,19 @@
![`" onerror="alert(1)//`]()
-
\ No newline at end of file
+
+
+[x](javascript:alert(origin))
+
+[x](javascript:1/alert(origin))
+
+-
+-  onerror=alert(origin) )
+
+
+
+
+
+![x](<"`"![x][id]
+[id]: x "`