diff --git a/.github/workflows/python.yaml b/.github/workflows/python.yaml index 40ce721a..2ca433a4 100644 --- a/.github/workflows/python.yaml +++ b/.github/workflows/python.yaml @@ -15,9 +15,9 @@ jobs: - macos-latest - windows-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/lib/markdown2.py b/lib/markdown2.py index dc698970..6518d783 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -862,6 +862,10 @@ def _detab(self, text: str) -> str: output.append(self._detab_line(line)) return '\n'.join(output) + # https://developer.mozilla.org/en-US/docs/Glossary/Void_element + # technically "self closing tags" (eg: <br/>) are not real HTML but no one cares + _void_tags = 'area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr' + # I broke out the html5 tags here and add them to _block_tags_a and # _block_tags_b. This way html5 tags are easy to keep track of. _html5tags = '|address|article|aside|canvas|figcaption|figure|footer|header|main|nav|section|video' @@ -906,6 +910,7 @@ def _detab(self, text: str) -> str: _html_markdown_attr_re = re.compile( # markdown attr, with optional assignment to true, must be followed by whitespace/boundary/closing tag chars r'''\s+markdown(?:="1"|='1'|=1)?(?![^\s/>\b])''') + def _hash_html_block_sub( self, match: Union[re.Match[str], str], @@ -1104,16 +1109,17 @@ def _strict_tag_block_sub( block += chunk if is_markup: - if chunk.startswith('%s bool: + if re.match(self._void_tags, tag_name): + return True + # check if number of open tags == number of close tags if len(re.findall('<%s(?:.*?)>' % tag_name, text)) != text.count('' % tag_name): return False @@ -1136,6 +1145,29 @@ def _tag_is_closed(self, tag_name: str, text: str) -> bool: open_index = text.find(f'<{tag_name}') return open_index != -1 and close_index != -1 and open_index < close_index + def _tag_imbalance(self, tag_name: str, text: str) -> int: + ''' + Find imbalanced HTML tags in some text + + Args: + tag_name: the name of the tag (eg: "ul") + text: the text to search + + Returns: + 0 for balanced tags, positive int for more opening tags than closing, negative int for + more closing tags than opening + ''' + if re.match(self._void_tags, tag_name): + return 0 + + count = 0 + for tag in re.finditer(r'<(/)?%s\b>?' 
% tag_name, text): + if tag.group(1): + count -= 1 + else: + count += 1 + return count + @mark_stage(Stage.LINK_DEFS) def _strip_link_definitions(self, text: str) -> str: # Strips link definitions from text, stores the URLs and titles in @@ -1421,13 +1453,13 @@ def _unhash_html_spans(self, text: str, spans=True, code=False) -> str: ''' orig = '' while text != orig: + orig = text if spans: for key, sanitized in list(self.html_spans.items()): text = text.replace(key, sanitized) if code: for code, key in list(self._code_table.items()): text = text.replace(key, code) - orig = text return text def _sanitize_html(self, s: str) -> str: @@ -1518,6 +1550,12 @@ def _protect_url(self, url: str) -> str: mime = data_url.group('mime') or '' if mime.startswith('image/') and data_url.group('token') == ';base64': charset='base64' + else: + url = ( + self._unhash_html_spans(url, code=True) + .replace('*', self._escape_table['*']) + .replace('_', self._escape_table['_']) + ) url = _html_escape_url(url, safe_mode=self.safe_mode, charset=charset) key = _hash_text(url) self._escape_table[url] = key @@ -1537,8 +1575,10 @@ def _safe_href(self): safe = r'-\w' # omitted ['"<>] for XSS reasons less_safe = r'#/\.!#$%&\(\)\+,/:;=\?@\[\]^`\{\}\|~' + # html encoded colon in a URL still functions as a normal colon, so need to detect those + protocol_seperators = [':', ':', ':', ':'] # dot seperated hostname, optional port number, not followed by protocol seperator - domain = r'(?:[{}]+(?:\.[{}]+)*)(?:(? + + + +

) <script>alert(origin)</script>

+ +

"

diff --git a/test/tm-cases/improper_void_tag_hashing_pr705.opts b/test/tm-cases/improper_void_tag_hashing_pr705.opts new file mode 100644 index 00000000..ad487c04 --- /dev/null +++ b/test/tm-cases/improper_void_tag_hashing_pr705.opts @@ -0,0 +1 @@ +{"safe_mode": "escape"} diff --git a/test/tm-cases/improper_void_tag_hashing_pr705.text b/test/tm-cases/improper_void_tag_hashing_pr705.text new file mode 100644 index 00000000..223862cf --- /dev/null +++ b/test/tm-cases/improper_void_tag_hashing_pr705.text @@ -0,0 +1,10 @@ +--- +* ``` + * ``` + + x +``` +--- +```) ``` +" +--- diff --git a/test/tm-cases/malformed_html_crash_issue584.html b/test/tm-cases/malformed_html_crash_issue584.html index e2071f84..00f32cdb 100644 --- a/test/tm-cases/malformed_html_crash_issue584.html +++ b/test/tm-cases/malformed_html_crash_issue584.html @@ -1,3 +1,3 @@ -

-
+

diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.html b/test/tm-cases/xss_from_incorrect_block_hashing.html new file mode 100644 index 00000000..db2fe827 --- /dev/null +++ b/test/tm-cases/xss_from_incorrect_block_hashing.html @@ -0,0 +1,14 @@ + + +
+ +

[x](")}<img src="x" onerror="alert(origin)">

+ +
diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.opts b/test/tm-cases/xss_from_incorrect_block_hashing.opts new file mode 100644 index 00000000..54de31a8 --- /dev/null +++ b/test/tm-cases/xss_from_incorrect_block_hashing.opts @@ -0,0 +1 @@ +{"safe_mode": "escape"} \ No newline at end of file diff --git a/test/tm-cases/xss_from_incorrect_block_hashing.text b/test/tm-cases/xss_from_incorrect_block_hashing.text new file mode 100644 index 00000000..d0770a4a --- /dev/null +++ b/test/tm-cases/xss_from_incorrect_block_hashing.text @@ -0,0 +1,5 @@ +- [x] + 1. - [x] +___ +[x](`")} +___ diff --git a/test/tm-cases/xss_smuggling_spans_in_image_attrs.html b/test/tm-cases/xss_smuggling_spans_in_image_attrs.html index 37cd276e..ed1ca655 100644 --- a/test/tm-cases/xss_smuggling_spans_in_image_attrs.html +++ b/test/tm-cases/xss_smuggling_spans_in_image_attrs.html @@ -3,3 +3,21 @@

<code>" onerror="alert(1)//</code>

A

+ +

x

+ +

x

+ + + +

+ +

http://onclick=alert(origin)//![](x)

+ +

![x](<"`"x

diff --git a/test/tm-cases/xss_smuggling_spans_in_image_attrs.text b/test/tm-cases/xss_smuggling_spans_in_image_attrs.text index 4a5c25a8..bee50136 100644 --- a/test/tm-cases/xss_smuggling_spans_in_image_attrs.text +++ b/test/tm-cases/xss_smuggling_spans_in_image_attrs.text @@ -2,4 +2,19 @@ ![`" onerror="alert(1)//`]() -![A](B "") \ No newline at end of file +![A](B "") + +[x](javascript:alert(origin)) + +[x](javascript:1/alert(origin)) + +- +- ![](x '`![](`') onerror=alert(origin) ) + +![](``) + + + +![x](<"`"![x][id] +[id]: x "`