From 29a9cec8dcd8cfdc6fe1a783754d9239971e5960 Mon Sep 17 00:00:00 2001 From: ffelixg <142172984+ffelixg@users.noreply.github.com> Date: Wed, 13 May 2026 12:52:23 +0200 Subject: [PATCH 1/3] Arrow fetch: request SQL_CHAR as SQL_C_WCHAR --- mssql_python/pybind/ddbc_bindings.cpp | 35 +++------------------------ tests/test_004_cursor_arrow.py | 26 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp index 66981c445..f5b0c4261 100644 --- a/mssql_python/pybind/ddbc_bindings.cpp +++ b/mssql_python/pybind/ddbc_bindings.cpp @@ -5022,9 +5022,8 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, ColumnBuffers buffers(numCols, fetchSize); if (!hasLobColumns && fetchSize > 0) { - // Bind columns — Arrow always uses SQL_C_CHAR for VARCHAR because - // it processes raw byte buffers directly, not via Python codecs. - ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_CHAR); + // Always request WCHARs so we don't have to deal with CHAR encodings + ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_WCHAR); if (!SQL_SUCCEEDED(ret)) { LOG("Error when binding columns"); return ret; @@ -5083,16 +5082,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, } case SQL_CHAR: case SQL_VARCHAR: - case SQL_LONGVARCHAR: { - ret = GetDataVar(hStmt, idxCol + 1, SQL_C_CHAR, - buffers.charBuffers[idxCol], - buffers.indicators[idxCol].data()); - if (!SQL_SUCCEEDED(ret)) { - LOG("Error fetching CHAR LOB for column %d", idxCol + 1); - return ret; - } - break; - } + case SQL_LONGVARCHAR: case SQL_SS_XML: case SQL_WCHAR: case SQL_WVARCHAR: @@ -5335,24 +5325,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, } case SQL_CHAR: case SQL_VARCHAR: - case SQL_LONGVARCHAR: { -#if defined(__APPLE__) || defined(__linux__) - uint64_t fetchBufferSize = columnSize * 
4 + 1 /*null-terminator*/; -#else - uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/; -#endif - auto target_vec = &arrowColumnProducer->varData; - auto start = arrowColumnProducer->varVal[idxRowArrow]; - while (target_vec->size() < start + dataLen) { - target_vec->resize(target_vec->size() * 2); - } - - std::memcpy(&(*target_vec)[start], - &buffers.charBuffers[idxCol][idxRowSql * fetchBufferSize], - dataLen); - arrowColumnProducer->varVal[idxRowArrow + 1] = start + dataLen; - break; - } + case SQL_LONGVARCHAR: case SQL_SS_XML: case SQL_WCHAR: case SQL_WVARCHAR: diff --git a/tests/test_004_cursor_arrow.py b/tests/test_004_cursor_arrow.py index ce6163f61..61175fd5b 100644 --- a/tests/test_004_cursor_arrow.py +++ b/tests/test_004_cursor_arrow.py @@ -313,6 +313,32 @@ def test_arrow_long_string(cursor: mssql_python.Cursor): assert batch.column(0).to_pylist() == [long_string] +def test_arrow_varchar_utf8_collation_unicode(cursor: mssql_python.Cursor): + table = "#t_arrow_utf8_varchar" + collation = "Latin1_General_100_CI_AS_SC_UTF8" + expected = [ + "Grüße", + "你好😀", + "こんにちは", + "Привет", + "Hello 世界", + "😀😃😄😁", + "", + None, + ] + + cursor.execute(f"create table {table} (id int primary key, v varchar(64) collate {collation})") + + try: + for index, value in enumerate(expected, start=1): + cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value) + tbl = cursor.execute(f"select v from {table} order by id").arrow() + assert tbl.column(0).type.equals(pa.large_string()) + assert tbl.column(0).to_pylist() == expected + finally: + cursor.execute(f"drop table if exists {table}") + + def test_rownumber_arrow_batch_interleaved_fetchmany(cursor: mssql_python.Cursor): """Verify that arrow_batch and fetchmany can be interleaved on the same result set with correct rownumber tracking and values.""" From 4ce73cacd7f3e4e258ca42a72513097bd0058303 Mon Sep 17 00:00:00 2001 From: ffelixg <142172984+ffelixg@users.noreply.github.com> Date: Wed, 13 May 2026 
19:10:57 +0200 Subject: [PATCH 2/3] Make utf8 collation test optional; Add mandatory cp1252 test to make up for it --- tests/test_004_cursor_arrow.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/test_004_cursor_arrow.py b/tests/test_004_cursor_arrow.py index 61175fd5b..731721e81 100644 --- a/tests/test_004_cursor_arrow.py +++ b/tests/test_004_cursor_arrow.py @@ -327,7 +327,38 @@ def test_arrow_varchar_utf8_collation_unicode(cursor: mssql_python.Cursor): None, ] - cursor.execute(f"create table {table} (id int primary key, v varchar(64) collate {collation})") + try: + cursor.execute( + f"create table {table} (id int primary key, v varchar(32) collate {collation})" + ) + except Exception as exc: + pytest.skip(f"UTF-8 collation '{collation}' not supported: {exc}") + + try: + for index, value in enumerate(expected, start=1): + cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value) + tbl = cursor.execute(f"select v from {table} order by id").arrow() + assert tbl.column(0).type.equals(pa.large_string()) + assert tbl.column(0).to_pylist() == expected + finally: + cursor.execute(f"drop table if exists {table}") + + +def test_arrow_varchar_utf8_collation_cp1252(cursor: mssql_python.Cursor): + table = "#t_arrow_cp1252_varchar" + collation = "SQL_Latin1_General_CP1_CI_AS" + expected = [ + "Grüße", + "café René!", + "naïve café", + "Español", + "Müller-Öztürk", + "Françoise", + "", + None, + ] + + cursor.execute(f"create table {table} (id int primary key, v varchar(32) collate {collation})") try: for index, value in enumerate(expected, start=1): From 2d5fc16a1e6c9ad34ab868c0b1d232bbe78b9d54 Mon Sep 17 00:00:00 2001 From: ffelixg <142172984+ffelixg@users.noreply.github.com> Date: Fri, 29 May 2026 01:43:59 +0200 Subject: [PATCH 3/3] Comments, test char+text on top of varchar, test name fix --- mssql_python/pybind/ddbc_bindings.cpp | 2 ++ tests/test_004_cursor_arrow.py | 37 +++++++++++++++++++++------ 
2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/mssql_python/pybind/ddbc_bindings.cpp b/mssql_python/pybind/ddbc_bindings.cpp index 36fdb4a02..44da0db94 100644 --- a/mssql_python/pybind/ddbc_bindings.cpp +++ b/mssql_python/pybind/ddbc_bindings.cpp @@ -4845,6 +4845,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, case SQL_WCHAR: case SQL_WVARCHAR: case SQL_WLONGVARCHAR: { + // Always request WCHARs so we don't have to deal with CHAR encodings. ret = GetDataVar(hStmt, idxCol + 1, SQL_C_WCHAR, buffers.wcharBuffers[idxCol], buffers.indicators[idxCol].data()); @@ -5088,6 +5089,7 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules, case SQL_WCHAR: case SQL_WVARCHAR: case SQL_WLONGVARCHAR: { + // We have previously fetched these as WCHARs, even for SQL_CHAR types. assert(dataLen % sizeof(SQLWCHAR) == 0); auto dataLenW = dataLen / sizeof(SQLWCHAR); auto wcharSource = diff --git a/tests/test_004_cursor_arrow.py b/tests/test_004_cursor_arrow.py index 731721e81..91e1dc1ab 100644 --- a/tests/test_004_cursor_arrow.py +++ b/tests/test_004_cursor_arrow.py @@ -313,8 +313,15 @@ def test_arrow_long_string(cursor: mssql_python.Cursor): assert batch.column(0).to_pylist() == [long_string] -def test_arrow_varchar_utf8_collation_unicode(cursor: mssql_python.Cursor): - table = "#t_arrow_utf8_varchar" +@pytest.mark.parametrize( + "sql_type", + [ + pytest.param("char(32)", id="char"), + pytest.param("varchar(32)", id="varchar"), + ], +) +def test_arrow_char_utf8_collation_unicode(cursor: mssql_python.Cursor, sql_type: str): + table = "#t_arrow_char_decode" collation = "Latin1_General_100_CI_AS_SC_UTF8" expected = [ "Grüße", @@ -329,7 +336,7 @@ def test_arrow_varchar_utf8_collation_unicode(cursor: mssql_python.Cursor): try: cursor.execute( - f"create table {table} (id int primary key, v varchar(32) collate {collation})" + f"create table {table} (id int primary key, v {sql_type} collate {collation})" ) except 
Exception as exc: pytest.skip(f"UTF-8 collation '{collation}' not supported: {exc}") @@ -339,13 +346,24 @@ def test_arrow_varchar_utf8_collation_unicode(cursor: mssql_python.Cursor): cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value) tbl = cursor.execute(f"select v from {table} order by id").arrow() assert tbl.column(0).type.equals(pa.large_string()) - assert tbl.column(0).to_pylist() == expected + for expected_val, actual_val in zip(expected, tbl.column(0).to_pylist(), strict=True): + if actual_val is not None: + actual_val = actual_val.strip() + assert expected_val == actual_val finally: cursor.execute(f"drop table if exists {table}") -def test_arrow_varchar_utf8_collation_cp1252(cursor: mssql_python.Cursor): - table = "#t_arrow_cp1252_varchar" +@pytest.mark.parametrize( + "sql_type", + [ + pytest.param("char(32)", id="char"), + pytest.param("varchar(32)", id="varchar"), + pytest.param("text", id="text"), + ], +) +def test_arrow_char_cp1252_collation_unicode(cursor: mssql_python.Cursor, sql_type: str): + table = "#t_arrow_char_decode" collation = "SQL_Latin1_General_CP1_CI_AS" expected = [ "Grüße", @@ -358,14 +376,17 @@ def test_arrow_varchar_utf8_collation_cp1252(cursor: mssql_python.Cursor): None, ] - cursor.execute(f"create table {table} (id int primary key, v varchar(32) collate {collation})") + cursor.execute(f"create table {table} (id int primary key, v {sql_type} collate {collation})") try: for index, value in enumerate(expected, start=1): cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value) tbl = cursor.execute(f"select v from {table} order by id").arrow() assert tbl.column(0).type.equals(pa.large_string()) - assert tbl.column(0).to_pylist() == expected + for expected_val, actual_val in zip(expected, tbl.column(0).to_pylist(), strict=True): + if actual_val is not None: + actual_val = actual_val.strip() + assert expected_val == actual_val finally: cursor.execute(f"drop table if exists {table}")