Skip to content

Commit ba785b8

Browse files
encukou, sethmlarson, ch4n3-yoon, StanFromIreland, picnixz
authored
[3.13] gh-149079: Fix O(n^2) canonical ordering in unicodedata.normalize() (GH-149080) (#150780)
Replace the insertion sort used for canonical ordering of combining characters with a hybrid approach: insertion sort for short runs (< 20) and counting sort for longer runs, reducing worst-case complexity from O(n^2) to O(n). This prevents denial of service via crafted Unicode strings with many combining characters in alternating CCC order. (cherry picked from commit 991224b) Co-authored-by: Seth Larson <seth@python.org> Co-authored-by: ch4n3-yoon <ch4n3.yoon@gmail.com> Co-authored-by: Seokchan Yoon <13852925+ch4n3-yoon@users.noreply.github.com> Co-authored-by: Stan Ulbrych <stan@python.org> Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> Co-authored-by: Serhiy Storchaka <storchaka@gmail.com> Co-authored-by: Maurycy Pawłowski-Wieroński <maurycy@maurycy.com>
1 parent 13d8f45 commit ba785b8

3 files changed

Lines changed: 151 additions & 26 deletions

File tree

Lib/test/test_unicodedata.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,34 @@ def test_issue10254(self):
562562
b = 'C\u0338' * 20 + '\xC7'
563563
self.assertEqual(self.db.normalize('NFC', a), b)
564564

565+
def test_long_combining_mark_run(self):
566+
# gh-149079: avoid quadratic canonical ordering.
567+
payload = "a" + ("\u0300\u0327" * 32)
568+
nfd = "a" + ("\u0327" * 32) + ("\u0300" * 32)
569+
nfc = "\u00e0" + ("\u0327" * 32) + ("\u0300" * 31)
570+
571+
self.assertEqual(self.db.normalize("NFD", payload), nfd)
572+
self.assertEqual(self.db.normalize("NFKD", payload), nfd)
573+
self.assertEqual(self.db.normalize("NFC", payload), nfc)
574+
self.assertEqual(self.db.normalize("NFKC", payload), nfc)
575+
576+
def test_combining_mark_run_fast_paths(self):
577+
# gh-149079: cover short runs and already-sorted long runs.
578+
short_payload = "a" + ("\u0300\u0327" * 9) + "\u0300"
579+
short_nfd = "a" + ("\u0327" * 9) + ("\u0300" * 10)
580+
short_nfc = "\u00e0" + ("\u0327" * 9) + ("\u0300" * 9)
581+
long_sorted = "a" + ("\u0327" * 30) + ("\u0300" * 30)
582+
long_sorted_nfc = "\u00e0" + ("\u0327" * 30) + ("\u0300" * 29)
583+
584+
self.assertEqual(self.db.normalize("NFD", short_payload), short_nfd)
585+
self.assertEqual(self.db.normalize("NFKD", short_payload), short_nfd)
586+
self.assertEqual(self.db.normalize("NFC", short_payload), short_nfc)
587+
self.assertEqual(self.db.normalize("NFKC", short_payload), short_nfc)
588+
self.assertEqual(self.db.normalize("NFD", long_sorted), long_sorted)
589+
self.assertEqual(self.db.normalize("NFKD", long_sorted), long_sorted)
590+
self.assertEqual(self.db.normalize("NFC", long_sorted), long_sorted_nfc)
591+
self.assertEqual(self.db.normalize("NFKC", long_sorted), long_sorted_nfc)
592+
565593
def test_issue29456(self):
566594
# Fix #29456
567595
u1176_str_a = '\u1100\u1176\u11a8'
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Fix a potential denial of service in :func:`unicodedata.normalize`. The
2+
canonical ordering step of Unicode normalization used a quadratic-time insertion
3+
sort for reordering combining characters, which could be exploited with
4+
crafted input containing many combining characters in non-canonical order.
5+
The insertion sort is now replaced with a linear-time counting sort for long runs.

Modules/unicodedata.c

Lines changed: 118 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -508,19 +508,80 @@ get_decomp_record(PyObject *self, Py_UCS4 code,
508508
(*index)++;
509509
}
510510

511+
/* Small combining runs are usually cheaper with insertion sort. */
512+
#define CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD 20
513+
514+
static void
515+
canonical_ordering_sort_insertion(int kind, void *data,
516+
Py_ssize_t start, Py_ssize_t end)
517+
{
518+
for (Py_ssize_t i = start + 1; i < end; i++) {
519+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
520+
unsigned char combining = _getrecord_ex(code)->combining;
521+
Py_ssize_t j = i;
522+
523+
while (j > start) {
524+
Py_UCS4 previous = PyUnicode_READ(kind, data, j - 1);
525+
if (_getrecord_ex(previous)->combining <= combining) {
526+
break;
527+
}
528+
PyUnicode_WRITE(kind, data, j, previous);
529+
j--;
530+
}
531+
if (j != i) {
532+
PyUnicode_WRITE(kind, data, j, code);
533+
}
534+
}
535+
}
536+
537+
static void
538+
canonical_ordering_sort_counting(int kind, void *data,
539+
Py_ssize_t start, Py_ssize_t end,
540+
Py_UCS4 *sortbuf)
541+
{
542+
Py_ssize_t counts[256] = {0};
543+
Py_ssize_t run_length = end - start;
544+
Py_ssize_t total = 0;
545+
546+
for (Py_ssize_t i = start; i < end; i++) {
547+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
548+
unsigned char combining = _getrecord_ex(code)->combining;
549+
counts[combining]++;
550+
}
551+
552+
for (size_t i = 0; i < Py_ARRAY_LENGTH(counts); i++) {
553+
Py_ssize_t count = counts[i];
554+
counts[i] = total;
555+
total += count;
556+
}
557+
558+
/* Reuse counts[] as the next output slot for each CCC. */
559+
for (Py_ssize_t i = start; i < end; i++) {
560+
Py_UCS4 code = PyUnicode_READ(kind, data, i);
561+
unsigned char combining = _getrecord_ex(code)->combining;
562+
sortbuf[counts[combining]++] = code;
563+
}
564+
for (Py_ssize_t i = 0; i < run_length; i++) {
565+
PyUnicode_WRITE(kind, data, start + i, sortbuf[i]);
566+
}
567+
}
568+
511569
static PyObject*
512570
nfd_nfkd(PyObject *self, PyObject *input, int k)
513571
{
514572
PyObject *result;
515573
Py_UCS4 *output;
516574
Py_ssize_t i, o, osize;
517-
int kind;
518-
const void *data;
575+
int input_kind, result_kind;
576+
const void *input_data;
577+
void *result_data;
519578
/* Longest decomposition in Unicode 3.2: U+FDFA */
520579
Py_UCS4 stack[20];
521580
Py_ssize_t space, isize;
522581
int index, prefix, count, stackptr;
523582
unsigned char prev, cur;
583+
Py_UCS4 *sortbuf = NULL;
584+
Py_ssize_t sortbuflen = 0;
524585

525586
stackptr = 0;
526587
isize = PyUnicode_GET_LENGTH(input);
@@ -540,11 +601,11 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
540601
return NULL;
541602
}
542603
i = o = 0;
543-
kind = PyUnicode_KIND(input);
544-
data = PyUnicode_DATA(input);
604+
input_kind = PyUnicode_KIND(input);
605+
input_data = PyUnicode_DATA(input);
545606

546607
while (i < isize) {
547-
stack[stackptr++] = PyUnicode_READ(kind, data, i++);
608+
stack[stackptr++] = PyUnicode_READ(input_kind, input_data, i++);
548609
while(stackptr) {
549610
Py_UCS4 code = stack[--stackptr];
550611
/* Hangul Decomposition adds three characters in
@@ -611,35 +672,66 @@ nfd_nfkd(PyObject *self, PyObject *input, int k)
611672
PyMem_Free(output);
612673
if (!result)
613674
return NULL;
675+
614676
/* result is guaranteed to be ready, as it is compact. */
615-
kind = PyUnicode_KIND(result);
616-
data = PyUnicode_DATA(result);
677+
result_kind = PyUnicode_KIND(result);
678+
result_data = PyUnicode_DATA(result);
617679

618-
/* Sort canonically. */
680+
/* Sort each consecutive combining-character run canonically. */
619681
i = 0;
620-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
621-
for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
622-
cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
623-
if (prev == 0 || cur == 0 || prev <= cur) {
624-
prev = cur;
682+
while (i < o) {
683+
Py_ssize_t run_length, run_start;
684+
int needs_sort = 0;
685+
686+
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
687+
prev = _getrecord_ex(ch)->combining;
688+
if (prev == 0) {
689+
i++;
625690
continue;
626691
}
627-
/* Non-canonical order. Need to switch *i with previous. */
628-
o = i - 1;
629-
while (1) {
630-
Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
631-
PyUnicode_WRITE(kind, data, o+1,
632-
PyUnicode_READ(kind, data, o));
633-
PyUnicode_WRITE(kind, data, o, tmp);
634-
o--;
635-
if (o < 0)
636-
break;
637-
prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
638-
if (prev == 0 || prev <= cur)
692+
693+
run_start = i++;
694+
while (i < o) {
695+
Py_UCS4 ch = PyUnicode_READ(result_kind, result_data, i);
696+
cur = _getrecord_ex(ch)->combining;
697+
if (cur == 0) {
639698
break;
699+
}
700+
if (prev > cur) {
701+
needs_sort = 1;
702+
}
703+
prev = cur;
704+
i++;
705+
}
706+
if (!needs_sort) {
707+
continue;
708+
}
709+
710+
run_length = i - run_start;
711+
if (run_length < CANONICAL_ORDERING_COUNTING_SORT_THRESHOLD) {
712+
canonical_ordering_sort_insertion(result_kind, result_data,
713+
run_start, i);
714+
continue;
640715
}
641-
prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
716+
717+
if (run_length > sortbuflen) {
718+
Py_UCS4 *new_sortbuf = PyMem_Resize(sortbuf,
719+
Py_UCS4,
720+
run_length);
721+
if (new_sortbuf == NULL) {
722+
PyErr_NoMemory();
723+
PyMem_Free(sortbuf);
724+
Py_DECREF(result);
725+
return NULL;
726+
}
727+
sortbuf = new_sortbuf;
728+
sortbuflen = run_length;
729+
}
730+
731+
canonical_ordering_sort_counting(result_kind, result_data,
732+
run_start, i, sortbuf);
642733
}
734+
PyMem_Free(sortbuf);
643735
return result;
644736
}
645737

0 commit comments

Comments
 (0)