trentm · Crozzers · May 14, 2026 · May 14, 2026 · May 14, 2026 · May 23, 2026
diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml
@@ -15,9 +15,9 @@ jobs:
           - macos-latest
           - windows-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v6
         with:
           python-version: ${{ matrix.python-version }}
       - name: Install dependencies

diff --git a/lib/markdown2.py b/lib/markdown2.py
@@ -862,6 +862,10 @@ def _detab(self, text: str) -> str:
             output.append(self._detab_line(line))
         return '\n'.join(output)
 
+    # https://developer.mozilla.org/en-US/docs/Glossary/Void_element
+    # technically "self closing tags" (eg: <hr />) are not real HTML but no one cares
+    _void_tags = 'area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'
+
     # I broke out the html5 tags here and add them to _block_tags_a and
     # _block_tags_b.  This way html5 tags are easy to keep track of.
     _html5tags = '|address|article|aside|canvas|figcaption|figure|footer|header|main|nav|section|video'
@@ -906,6 +910,7 @@ def _detab(self, text: str) -> str:
     _html_markdown_attr_re = re.compile(
         # markdown attr, with optional assignment to true, must be followed by whitespace/boundary/closing tag chars
         r'''\s+markdown(?:="1"|='1'|=1)?(?![^\s/>\b])''')
+
     def _hash_html_block_sub(
         self,
         match: Union[re.Match[str], str],
@@ -1104,16 +1109,17 @@ def _strict_tag_block_sub(
             block += chunk
 
             if is_markup:
-                if chunk.startswith('%s</' % is_markup.group(1)):
-                    tag_count -= 1
+                if self._tag_is_closed(is_markup.group(3), chunk):
+                    # if close tag is in same line we must ignore these
+                    is_markup = None
                 else:
-                    # if close tag is in same line
-                    if self._tag_is_closed(is_markup.group(3), chunk):
-                        # we must ignore these
-                        is_markup = None
-                    else:
-                        tag_count += 1
-                        current_tag = is_markup.group(3)
+                    # add up all the open/close tags possibly in the same line and add that to the total
+                    current_tag = is_markup.group(3)
+                    tag_count += self._tag_imbalance(current_tag, chunk)
+            elif current_tag != html_tags_re and current_tag in chunk:
+                # if we're looking for a specific tag then check for any opens/closes later on in the
+                # line that may throw off our count
+                tag_count += self._tag_imbalance(current_tag, chunk)
 
             if tag_count == 0:
                 if is_markup:
@@ -1127,6 +1133,9 @@ def _strict_tag_block_sub(
         return result
 
     def _tag_is_closed(self, tag_name: str, text: str) -> bool:
+        if re.match(self._void_tags, tag_name):
+            return True
+
         # check if number of open tags == number of close tags
         if len(re.findall('<%s(?:.*?)>' % tag_name, text)) != text.count('</%s>' % tag_name):
             return False
@@ -1136,6 +1145,29 @@ def _tag_is_closed(self, tag_name: str, text: str) -> bool:
         open_index = text.find(f'<{tag_name}')
         return open_index != -1 and close_index != -1 and open_index < close_index
 
+    def _tag_imbalance(self, tag_name: str, text: str) -> int:
+        '''
+        Find imbalanced HTML tags in some text. Void tags are always reported as balanced.
+
+        Args:
+            tag_name: the name of the tag (eg: "ul")
+            text: the text to search
+
+        Returns:
+            0 for balanced tags, positive int for more opening tags than closing, negative int for
+            more closing tags than opening
+        '''
+        # fullmatch, not match: a prefix match would treat eg: "colgroup" as the void tag "col"
+        if re.fullmatch(self._void_tags, tag_name):
+            return 0
+
+        # each opening tag counts as +1 and each closing tag as -1
+        count = 0
+        for tag in re.finditer(r'<(/)?%s\b>?' % tag_name, text):
+            # group 1 captures the "/" that marks a closing tag
+            count += -1 if tag.group(1) else 1
+        return count
+
     @mark_stage(Stage.LINK_DEFS)
     def _strip_link_definitions(self, text: str) -> str:
         # Strips link definitions from text, stores the URLs and titles in
@@ -1421,13 +1453,13 @@ def _unhash_html_spans(self, text: str, spans=True, code=False) -> str:
         '''
         orig = ''
         while text != orig:
+            orig = text
             if spans:
                 for key, sanitized in list(self.html_spans.items()):
                     text = text.replace(key, sanitized)
             if code:
                 for code, key in list(self._code_table.items()):
                     text = text.replace(key, code)
-            orig = text
         return text
 
     def _sanitize_html(self, s: str) -> str:
@@ -1518,6 +1550,12 @@ def _protect_url(self, url: str) -> str:
             mime = data_url.group('mime') or ''
             if mime.startswith('image/') and data_url.group('token') == ';base64':
                 charset='base64'
+        else:
+            url = (
+                self._unhash_html_spans(url, code=True)
+                .replace('*', self._escape_table['*'])
+                .replace('_', self._escape_table['_'])
+            )
         url = _html_escape_url(url, safe_mode=self.safe_mode, charset=charset)
         key = _hash_text(url)
         self._escape_table[url] = key
@@ -1537,8 +1575,10 @@ def _safe_href(self):
         safe = r'-\w'
         # omitted ['"<>] for XSS reasons
         less_safe = r'#/\.!#$%&\(\)\+,/:;=\?@\[\]^`\{\}\|~'
+        # html encoded colon in a URL still functions as a normal colon, so need to detect those
+        protocol_seperators = [':', '&#x3a;', '&#58;', '&colon;']
         # dot seperated hostname, optional port number, not followed by protocol seperator
-        domain = r'(?:[{}]+(?:\.[{}]+)*)(?:(?<!tel):\d+/?)?(?![^:/]*:/*)'.format(safe, safe)
+        domain = r'(?:[{}]+(?:\.[{}]+)*)(?:(?<!tel)(?<!javascript):\d+/?)?(?![^:/]*(?:{})/*)'.format(safe, safe, '|'.join(protocol_seperators))
         fragment = r'[%s]*' % (safe + less_safe)
 
         return re.compile(r'^(?:({})?({})({})|(#|\.{{,2}}/)({}))$'.format(self._safe_protocols, domain, fragment, fragment), re.I)
@@ -3201,6 +3241,14 @@ def run(self, text: str):
                 link_text = self.md._hash_html_spans(link_text)
                 link_text = self.md._unhash_html_spans(link_text)
 
+            # check that this link is not inside an autolink
+            if any(
+                autolink.start() < start_idx < p < autolink.end()
+                for autolink in self.md._auto_link_re.finditer(text)
+            ):
+                curr_pos = start_idx + 1
+                continue
+
             # Possibly a footnote ref?
             if "footnotes" in self.md.extras and link_text.startswith("^"):
                 normed_id = re.sub(r'\W', '-', link_text[1:])
@@ -3234,7 +3282,6 @@ def run(self, text: str):
                     continue
 
                 text, url, title, url_end_idx = parsed
-                url = self.md._unhash_html_spans(url, code=True)
             # reference anchor or reference img
             else:
                 if not self.options.get('ref', True):
@@ -3253,13 +3300,6 @@ def run(self, text: str):
                     curr_pos = p
                     continue
 
-            # -- Encode and hash the URL and title to avoid conflicts with italics/bold
-
-            url = (
-                url
-                .replace('*', self.md._escape_table['*'])
-                .replace('_', self.md._escape_table['_'])
-            )
             if title:
                 if self.md.safe_mode:
                     # expose span contents for escaping - fix #691, #703
@@ -3269,6 +3309,8 @@ def run(self, text: str):
                     .replace('*', self.md._escape_table['*'])
                     .replace('_', self.md._escape_table['_'])
                 )
+                if self.md.safe_mode:
+                    title = self.md._hash_span(title)
                 title_str = f' title="{title}"'
             else:
                 title_str = ''

diff --git a/test/tm-cases/improper_void_tag_hashing_pr705.html b/test/tm-cases/improper_void_tag_hashing_pr705.html
@@ -0,0 +1,17 @@
+<hr />
+
+<ul>
+<li><p>```</p>
+
+<ul>
+<li>```</li>
+</ul>
+
+<p>x</p>
+
+<h2>```</h2></li>
+</ul>
+
+<p><code>) &lt;script&gt;alert(origin)&lt;/script&gt;</code></p>
+
+<h2>"</h2>
diff --git a/test/tm-cases/improper_void_tag_hashing_pr705.opts b/test/tm-cases/improper_void_tag_hashing_pr705.opts
@@ -0,0 +1 @@
+{"safe_mode": "escape"}
diff --git a/test/tm-cases/improper_void_tag_hashing_pr705.text b/test/tm-cases/improper_void_tag_hashing_pr705.text
@@ -0,0 +1,10 @@
+---
+* ```
+    * ```
+
+	x
+```
+---
+```) <script>alert(origin)</script>```
+"
+---
diff --git a/test/tm-cases/malformed_html_crash_issue584.html b/test/tm-cases/malformed_html_crash_issue584.html
@@ -1,3 +1,3 @@
-</p
->
-<pre>
+<p></p
+&gt;
+<pre></p>
diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.html b/test/tm-cases/xss_from_incorrect_block_hashing.html
@@ -0,0 +1,14 @@
+<ul>
+<li>[x]
+<ol>
+<li><ul>
+<li>[x]</li>
+</ul></li>
+</ol></li>
+</ul>
+
+<hr />
+
+<p>[x](<code>")}&lt;img src="x</code>" onerror="alert(origin)"></p>
+
+<hr />
diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.opts b/test/tm-cases/xss_from_incorrect_block_hashing.opts
@@ -0,0 +1 @@
+{"safe_mode": "escape"}
diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.text b/test/tm-cases/xss_from_incorrect_block_hashing.text
@@ -0,0 +1,5 @@
+- [x]
+   1. - [x]
+___
+[x](`")}<img src="x`" onerror="alert(origin)">
+___
diff --git a/test/tm-cases/xss_smuggling_spans_in_image_attrs.html b/test/tm-cases/xss_smuggling_spans_in_image_attrs.html
@@ -3,3 +3,21 @@
 <p><img src="" alt="&lt;code&gt;&quot; onerror=&quot;alert(1)//&lt;/code&gt;" /></p>
 
 <p><img src="B" alt="A" title="&lt;C D=&quot;E&quot; onerror=alert(origin) &gt;" /></p>
+
+<p><a href="#">x</a></p>
+
+<p><a href="#">x</a></p>
+
+<ul>
+<li>
+<ul>
+<li><img src="x" alt="" title="&lt;code&gt;![](&lt;/code&gt;" /> onerror=alert(origin) )</li>
+</ul></li>
+</ul>
+
+<p><img src="code&gt;&lt;A B=&quot;
+&quot; onerror=&quot;alert(origin)&quot;&gt;&lt;/code" alt="" /></p>
+
+<p><a href="http://onclick=alert(origin)//![](x)">http://onclick=alert(origin)//![](x)</a></p>
+
+<p>![x](&lt;"`"<img src="x &quot;&lt;A B=&quot;&quot; onerror=&quot;alert(origin)&quot;&gt;`" alt="x" /></p>
diff --git a/test/tm-cases/xss_smuggling_spans_in_image_attrs.text b/test/tm-cases/xss_smuggling_spans_in_image_attrs.text
@@ -2,4 +2,19 @@
 
 ![`" onerror="alert(1)//`]()
 
-![A](B "<C D="E" onerror=alert(origin) >")
+![A](B "<C D="E" onerror=alert(origin) >")
+
+[x](javascript&#58;alert(origin))
+
+[x](javascript:1/alert(origin))
+
+- 
+- ![](x '`![](`') onerror=alert(origin) )
+
+![](`<A B="
+" onerror="alert(origin)">`)
+
+<http://onclick=alert(origin)//![](x)>
+
+![x](<"`"![x][id]
+[id]: x "<A B="" onerror="alert(origin)">`