Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/python.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ jobs:
- macos-latest
- windows-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
uses: actions/setup-python@v6
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down
80 changes: 61 additions & 19 deletions lib/markdown2.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,10 @@ def _detab(self, text: str) -> str:
output.append(self._detab_line(line))
return '\n'.join(output)

# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
# technically "self closing tags" (eg: <hr />) are not real HTML but no one cares
_void_tags = 'area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr'

# I broke out the html5 tags here and add them to _block_tags_a and
# _block_tags_b. This way html5 tags are easy to keep track of.
_html5tags = '|address|article|aside|canvas|figcaption|figure|footer|header|main|nav|section|video'
Expand Down Expand Up @@ -906,6 +910,7 @@ def _detab(self, text: str) -> str:
_html_markdown_attr_re = re.compile(
# markdown attr, with optional assignment to true, must be followed by whitespace/boundary/closing tag chars
r'''\s+markdown(?:="1"|='1'|=1)?(?![^\s/>\b])''')

def _hash_html_block_sub(
self,
match: Union[re.Match[str], str],
Expand Down Expand Up @@ -1104,16 +1109,17 @@ def _strict_tag_block_sub(
block += chunk

if is_markup:
if chunk.startswith('%s</' % is_markup.group(1)):
tag_count -= 1
if self._tag_is_closed(is_markup.group(3), chunk):
# if close tag is in same line we must ignore these
is_markup = None
else:
# if close tag is in same line
if self._tag_is_closed(is_markup.group(3), chunk):
# we must ignore these
is_markup = None
else:
tag_count += 1
current_tag = is_markup.group(3)
# add up all the open/close tags possibly in the same line and add that to the total
current_tag = is_markup.group(3)
tag_count += self._tag_imbalance(current_tag, chunk)
elif current_tag != html_tags_re and current_tag in chunk:
# if we're looking for a specific tag then check for any opens/closes later on in the
# line that may throw off our count
tag_count += self._tag_imbalance(current_tag, chunk)

if tag_count == 0:
if is_markup:
Expand All @@ -1127,6 +1133,9 @@ def _strict_tag_block_sub(
return result

def _tag_is_closed(self, tag_name: str, text: str) -> bool:
if re.match(self._void_tags, tag_name):
return True

# check if number of open tags == number of close tags
if len(re.findall('<%s(?:.*?)>' % tag_name, text)) != text.count('</%s>' % tag_name):
return False
Expand All @@ -1136,6 +1145,29 @@ def _tag_is_closed(self, tag_name: str, text: str) -> bool:
open_index = text.find(f'<{tag_name}')
return open_index != -1 and close_index != -1 and open_index < close_index

def _tag_imbalance(self, tag_name: str, text: str) -> int:
    '''
    Find imbalanced HTML tags in some text

    Args:
        tag_name: the name of the tag (eg: "ul"). It is interpolated into a
            regex as-is, so it is expected to be a plain tag name
        text: the text to search

    Returns:
        0 for balanced tags, positive int for more opening tags than closing, negative int for
        more closing tags than opening. Void tags (see `_void_tags`) always return 0
    '''
    # Void elements never take a closing tag, so they can never be imbalanced.
    # Use fullmatch rather than match: match only anchors at the start, which would
    # wrongly treat tags that merely begin with a void tag name (eg: "colgroup" vs "col")
    # as void and skip counting them
    if re.fullmatch(self._void_tags, tag_name):
        return 0

    count = 0
    # `\b` stops "<p" from matching inside "<pre"; group 1 captures the "/" of a closing tag
    for tag in re.finditer(r'<(/)?%s\b>?' % tag_name, text):
        if tag.group(1):
            count -= 1
        else:
            count += 1
    return count

@mark_stage(Stage.LINK_DEFS)
def _strip_link_definitions(self, text: str) -> str:
# Strips link definitions from text, stores the URLs and titles in
Expand Down Expand Up @@ -1421,13 +1453,13 @@ def _unhash_html_spans(self, text: str, spans=True, code=False) -> str:
'''
orig = ''
while text != orig:
orig = text
if spans:
for key, sanitized in list(self.html_spans.items()):
text = text.replace(key, sanitized)
if code:
for code, key in list(self._code_table.items()):
text = text.replace(key, code)
orig = text
return text

def _sanitize_html(self, s: str) -> str:
Expand Down Expand Up @@ -1518,6 +1550,12 @@ def _protect_url(self, url: str) -> str:
mime = data_url.group('mime') or ''
if mime.startswith('image/') and data_url.group('token') == ';base64':
charset='base64'
else:
url = (
self._unhash_html_spans(url, code=True)
.replace('*', self._escape_table['*'])
.replace('_', self._escape_table['_'])
)
url = _html_escape_url(url, safe_mode=self.safe_mode, charset=charset)
key = _hash_text(url)
self._escape_table[url] = key
Expand All @@ -1537,8 +1575,10 @@ def _safe_href(self):
safe = r'-\w'
# omitted ['"<>] for XSS reasons
less_safe = r'#/\.!#$%&\(\)\+,/:;=\?@\[\]^`\{\}\|~'
# html encoded colon in a URL still functions as a normal colon, so need to detect those
protocol_seperators = [':', '&#x3a;', '&#58;', '&colon;']
# dot separated hostname, optional port number, not followed by protocol separator
domain = r'(?:[{}]+(?:\.[{}]+)*)(?:(?<!tel):\d+/?)?(?![^:/]*:/*)'.format(safe, safe)
domain = r'(?:[{}]+(?:\.[{}]+)*)(?:(?<!tel)(?<!javascript):\d+/?)?(?![^:/]*(?:{})/*)'.format(safe, safe, '|'.join(protocol_seperators))
fragment = r'[%s]*' % (safe + less_safe)

return re.compile(r'^(?:({})?({})({})|(#|\.{{,2}}/)({}))$'.format(self._safe_protocols, domain, fragment, fragment), re.I)
Expand Down Expand Up @@ -3201,6 +3241,14 @@ def run(self, text: str):
link_text = self.md._hash_html_spans(link_text)
link_text = self.md._unhash_html_spans(link_text)

# check that this link is not inside an autolink
if any(
autolink.start() < start_idx < p < autolink.end()
for autolink in self.md._auto_link_re.finditer(text)
):
curr_pos = start_idx + 1
continue

# Possibly a footnote ref?
if "footnotes" in self.md.extras and link_text.startswith("^"):
normed_id = re.sub(r'\W', '-', link_text[1:])
Expand Down Expand Up @@ -3234,7 +3282,6 @@ def run(self, text: str):
continue

text, url, title, url_end_idx = parsed
url = self.md._unhash_html_spans(url, code=True)
# reference anchor or reference img
else:
if not self.options.get('ref', True):
Expand All @@ -3253,13 +3300,6 @@ def run(self, text: str):
curr_pos = p
continue

# -- Encode and hash the URL and title to avoid conflicts with italics/bold

url = (
url
.replace('*', self.md._escape_table['*'])
.replace('_', self.md._escape_table['_'])
)
if title:
if self.md.safe_mode:
# expose span contents for escaping - fix #691, #703
Expand All @@ -3269,6 +3309,8 @@ def run(self, text: str):
.replace('*', self.md._escape_table['*'])
.replace('_', self.md._escape_table['_'])
)
if self.md.safe_mode:
title = self.md._hash_span(title)
title_str = f' title="{title}"'
else:
title_str = ''
Expand Down
17 changes: 17 additions & 0 deletions test/tm-cases/improper_void_tag_hashing_pr705.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<hr />

<ul>
<li><p>```</p>

<ul>
<li>```</li>
</ul>

<p>x</p>

<h2>```</h2></li>
</ul>

<p><code>) &lt;script&gt;alert(origin)&lt;/script&gt;</code></p>

<h2>"</h2>
1 change: 1 addition & 0 deletions test/tm-cases/improper_void_tag_hashing_pr705.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"safe_mode": "escape"}
10 changes: 10 additions & 0 deletions test/tm-cases/improper_void_tag_hashing_pr705.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
* ```
* ```

x
```
---
```) <script>alert(origin)</script>```
"
---
6 changes: 3 additions & 3 deletions test/tm-cases/malformed_html_crash_issue584.html
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
</p
>
<pre>
<p></p
&gt;
<pre></p>
14 changes: 14 additions & 0 deletions test/tm-cases/xss_from_incorrect_block_hashing.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<ul>
<li>[x]
<ol>
<li><ul>
<li>[x]</li>
</ul></li>
</ol></li>
</ul>

<hr />

<p>[x](<code>")}&lt;img src="x</code>" onerror="alert(origin)"></p>

<hr />
1 change: 1 addition & 0 deletions test/tm-cases/xss_from_incorrect_block_hashing.opts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"safe_mode": "escape"}
5 changes: 5 additions & 0 deletions test/tm-cases/xss_from_incorrect_block_hashing.text
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
- [x]
1. - [x]
___
[x](`")}<img src="x`" onerror="alert(origin)">
___
18 changes: 18 additions & 0 deletions test/tm-cases/xss_smuggling_spans_in_image_attrs.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,21 @@
<p><img src="" alt="&lt;code&gt;&quot; onerror=&quot;alert(1)//&lt;/code&gt;" /></p>

<p><img src="B" alt="A" title="&lt;C D=&quot;E&quot; onerror=alert(origin) &gt;" /></p>

<p><a href="#">x</a></p>

<p><a href="#">x</a></p>

<ul>
<li>
<ul>
<li><img src="x" alt="" title="&lt;code&gt;![](&lt;/code&gt;" /> onerror=alert(origin) )</li>
</ul></li>
</ul>

<p><img src="code&gt;&lt;A B=&quot;
&quot; onerror=&quot;alert(origin)&quot;&gt;&lt;/code" alt="" /></p>

<p><a href="http://onclick=alert(origin)//![](x)">http://onclick=alert(origin)//![](x)</a></p>

<p>![x](&lt;"`"<img src="x &quot;&lt;A B=&quot;&quot; onerror=&quot;alert(origin)&quot;&gt;`" alt="x" /></p>
17 changes: 16 additions & 1 deletion test/tm-cases/xss_smuggling_spans_in_image_attrs.text
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,19 @@

![`" onerror="alert(1)//`]()

![A](B "<C D="E" onerror=alert(origin) >")
![A](B "<C D="E" onerror=alert(origin) >")

[x](javascript&#58;alert(origin))

[x](javascript:1/alert(origin))

-
- ![](x '`![](`') onerror=alert(origin) )

![](`<A B="
" onerror="alert(origin)">`)

<http://onclick=alert(origin)//![](x)>

![x](<"`"![x][id]
[id]: x "<A B="" onerror="alert(origin)">`
Loading