From f157b46d31f5918aad2e7ad492b01db496807b1c Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Fri, 12 Jun 2026 07:42:54 -0600
Subject: [PATCH] feat: opt concat into codegen dispatch for non-UTF8_BINARY collations

CometConcat reports Incompatible when a child uses a non-default
collation, because the native concat UDF produces UTF8_BINARY and loses
the collation. Mixing in CodegenDispatchFallback routes that case
through the JVM codegen dispatcher (Spark's own doGenCode) so collated
concat runs natively and matches Spark instead of falling back.

The Unsupported non-string-input case (binary/array children) is
unchanged.

Part of #4596.
---
 .../scala/org/apache/comet/serde/strings.scala      |  5 ++++-
 .../sql-tests/expressions/string/collation.sql      | 14 ++++++++------
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala
index c4abe8ad4e..48717ef1d1 100644
--- a/spark/src/main/scala/org/apache/comet/serde/strings.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala
@@ -274,7 +274,10 @@ object CometRight extends CometExpressionSerde[Right] {
   }
 }
 
-object CometConcat extends CometScalarFunction[Concat]("concat") with CometTypeShim {
+object CometConcat
+    extends CometScalarFunction[Concat]("concat")
+    with CometTypeShim
+    with CodegenDispatchFallback {
   private val unsupportedReason = "CONCAT supports only string input parameters"
 
   // Spark 4.0 widens Concat to accept collated strings and preserves the collation in the merged
diff --git a/spark/src/test/resources/sql-tests/expressions/string/collation.sql b/spark/src/test/resources/sql-tests/expressions/string/collation.sql
index 95abecbd1e..89dd95b9e7 100644
--- a/spark/src/test/resources/sql-tests/expressions/string/collation.sql
+++ b/spark/src/test/resources/sql-tests/expressions/string/collation.sql
@@ -32,9 +32,11 @@ SELECT collation('hello' COLLATE UTF8_BINARY)
 query
 SELECT collation(CAST(NULL AS STRING))
 
--- concat preserves a non-default collation in its result type, but Comet's native concat produces
--- UTF8_BINARY, so it is Incompatible and falls back to Spark by default.
-query expect_fallback(concat does not support non-UTF8_BINARY collations)
+-- concat preserves a non-default collation in its result type, which Comet's native concat does
+-- not, so concat is Incompatible. It is enrolled in the JVM codegen dispatcher, which runs Spark's
+-- own doGenCode inside the Comet pipeline, so a collated concat is evaluated natively and matches
+-- Spark.
+query
 SELECT concat('Hello' COLLATE UTF8_LCASE, 'World' COLLATE UTF8_LCASE)
 
 -- reverse is enrolled in the JVM codegen dispatcher, which runs Spark's own doGenCode inside the
@@ -42,9 +44,9 @@ SELECT concat('Hello' COLLATE UTF8_LCASE, 'World' COLLATE UTF8_LCASE)
 query
 SELECT reverse('Hello' COLLATE UTF8_LCASE)
 
--- A standard ICU collation (UNICODE_CI) still falls back for concat, confirming the gate covers
--- any non-UTF8_BINARY collation rather than just UTF8_LCASE.
-query expect_fallback(concat does not support non-UTF8_BINARY collations)
+-- A standard ICU collation (UNICODE_CI) also dispatches and matches Spark, confirming the path
+-- covers any non-UTF8_BINARY collation rather than just UTF8_LCASE.
+query
 SELECT concat('Hello' COLLATE UNICODE_CI, 'World' COLLATE UNICODE_CI)
 
 query