Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions docs/source/user-guide/latest/expressions.md
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the
| `aggregate` | ✅ | |
| `array_sort` | ✅ | |
| `exists` | ✅ | |
| `filter` | 🔜 | General lambda not yet wired; the `array_compact` form is supported ([#4224](https://github.com/apache/datafusion-comet/issues/4224)) |
| `filter` | ✅ | General lambda routed through the JVM codegen dispatcher; the `array_compact` form runs natively |
| `forall` | ✅ | |
| `map_filter` | ✅ | |
| `map_zip_with` | ✅ | |
Expand Down Expand Up @@ -570,12 +570,12 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the
| `overlay` | ✅ | |
| `position` | ✅ | |
| `printf` | ✅ | |
| `regexp_count` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) |
| `regexp_count` | ✅ | Runs natively (rewrites to `size(regexp_extract_all(...))`) |
| `regexp_extract` | ✅ | |
| `regexp_extract_all` | ✅ | |
| `regexp_instr` | ✅ | Routed through the JVM codegen dispatcher |
| `regexp_replace` | ✅ | |
| `regexp_substr` | 🔜 | tracking [#4098](https://github.com/apache/datafusion-comet/issues/4098) |
| `regexp_substr` | ✅ | Runs natively (rewrites to `nullif(regexp_extract(...), '')`) |
| `repeat` | ✅ | |
| `replace` | ✅ | |
| `right` | ✅ | |
Expand All @@ -595,8 +595,8 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the
| `to_varchar` | ✅ | |
| `translate` | ✅ | Falls back by default; opt-in via allowIncompatible ([#4463](https://github.com/apache/datafusion-comet/issues/4463)) |
| `trim` | ✅ | |
| `try_to_binary` | 🔜 | Lowers to `TryEval(...)`, which falls back |
| `try_to_number` | 🔜 | TRY variant of `to_number` |
| `try_to_binary` | ✅ | Runs natively (rewrites to `try_eval(to_binary(...))`) |
| `try_to_number` | ✅ | Routed through the JVM codegen dispatcher |
| `ucase` | ✅ | |
| `unbase64` | ✅ | |
| `upper` | ✅ | |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,8 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim {
classOf[StringLocate] -> CometStringLocate,
classOf[UnBase64] -> CometUnBase64,
classOf[ToCharacter] -> CometToCharacter,
classOf[ToNumber] -> CometToNumber)
classOf[ToNumber] -> CometToNumber,
classOf[TryToNumber] -> CometTryToNumber)
base ++ sparkVersionSpecificStringExpressions
}

Expand Down
22 changes: 12 additions & 10 deletions spark/src/main/scala/org/apache/comet/serde/arrays.scala
Original file line number Diff line number Diff line change
Expand Up @@ -640,21 +640,23 @@ object CometFlatten extends CometExpressionSerde[Flatten] with ArraysBase {

/**
 * Serde for Spark's `ArrayFilter` expression.
 *
 * NOTE(review): this listing is a diff rendering without +/- markers, so lines from the earlier
 * and the later revision appear together: there are two `getSupportLevel` definitions, and
 * `convert` contains an unconditional `CometArrayCompact.convert` call directly ahead of the
 * `match`. Presumably the one-line `getSupportLevel` and the `match` belong to the later
 * revision; confirm against the repository before relying on this text.
 */
object CometArrayFilter extends CometExpressionSerde[ArrayFilter] {

override def getUnsupportedReasons(): Seq[String] = Seq(
"Only supports `array_filter` when the function is `IsNotNull` (used by `array_compact`)")

// Reports `Compatible` only when the first child of the lambda is `IsNotNull`; anything else is
// reported as `Unsupported`.
override def getSupportLevel(expr: ArrayFilter): SupportLevel = {
expr.function.children.headOption match {
case Some(_: IsNotNull) => Compatible()
case _ => Unsupported()
}
}
// Reports `Compatible` for every `ArrayFilter`, regardless of the lambda.
override def getSupportLevel(expr: ArrayFilter): SupportLevel = Compatible()

// Chooses the serialization path from the first child of the lambda function: `IsNotNull` is
// delegated to `CometArrayCompact.convert`, every other shape to
// `CometScalaUDF.emitJvmCodegenDispatch`.
override def convert(
expr: ArrayFilter,
inputs: Seq[Attribute],
binding: Boolean): Option[ExprOuterClass.Expr] = {
CometArrayCompact.convert(expr, inputs, binding)
expr.function.children.headOption match {
case Some(_: IsNotNull) =>
// Fast path: `array_compact` lowers to `filter(arr, x -> x is not null)`. Use the native
// array_compact serde to avoid the per-batch JNI cost of the codegen dispatcher.
CometArrayCompact.convert(expr, inputs, binding)
case _ =>
// General lambda: run Spark's own evaluation through the codegen dispatcher so the result
// matches Spark exactly, like the other higher-order functions (`transform`, `exists`).
// Falls back to Spark when the dispatcher is disabled.
CometScalaUDF.emitJvmCodegenDispatch(expr, inputs, binding)
}
}
}

Expand Down
4 changes: 3 additions & 1 deletion spark/src/main/scala/org/apache/comet/serde/strings.scala
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

package org.apache.comet.serde

import org.apache.spark.sql.catalyst.expressions.{Attribute, BitLength, Cast, Concat, ConcatWs, Elt, Expression, FindInSet, FormatNumber, FormatString, GetJsonObject, If, InitCap, IsNull, Left, Length, Levenshtein, Like, Literal, Lower, OctetLength, Overlay, RegExpExtract, RegExpExtractAll, RegExpInStr, RegExpReplace, Right, RLike, SoundEx, StringLocate, StringLPad, StringRepeat, StringReplace, StringRPad, StringSplit, StringTranslate, Substring, SubstringIndex, ToCharacter, ToNumber, UnBase64, Upper}
import org.apache.spark.sql.catalyst.expressions.{Attribute, BitLength, Cast, Concat, ConcatWs, Elt, Expression, FindInSet, FormatNumber, FormatString, GetJsonObject, If, InitCap, IsNull, Left, Length, Levenshtein, Like, Literal, Lower, OctetLength, Overlay, RegExpExtract, RegExpExtractAll, RegExpInStr, RegExpReplace, Right, RLike, SoundEx, StringLocate, StringLPad, StringRepeat, StringReplace, StringRPad, StringSplit, StringTranslate, Substring, SubstringIndex, ToCharacter, ToNumber, TryToNumber, UnBase64, Upper}
import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType}
import org.apache.spark.unsafe.types.UTF8String

Expand Down Expand Up @@ -643,3 +643,5 @@ object CometUnBase64 extends CometCodegenDispatch[UnBase64]
/** Serde for Spark's `ToCharacter` expression; adds nothing beyond `CometCodegenDispatch`. */
object CometToCharacter extends CometCodegenDispatch[ToCharacter]

/** Serde for Spark's `ToNumber` expression; adds nothing beyond `CometCodegenDispatch`. */
object CometToNumber extends CometCodegenDispatch[ToNumber]

/**
 * Serde for Spark's `TryToNumber` expression (`try_to_number`); adds nothing beyond
 * `CometCodegenDispatch`, mirroring `CometToNumber`.
 */
object CometTryToNumber extends CometCodegenDispatch[TryToNumber]
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ CREATE TABLE test_array_filter(arr array<int>) USING parquet
statement
INSERT INTO test_array_filter VALUES (array(1, 2, 3, 4, 5)), (array(-1, 0, 1)), (array(10)), (NULL)

query spark_answer_only
query
SELECT filter(arr, x -> x > 2) FROM test_array_filter

query spark_answer_only
query
SELECT filter(arr, x -> x >= 0) FROM test_array_filter

query spark_answer_only
query
SELECT filter(arr, (x, i) -> i > 0) FROM test_array_filter
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.

-- regexp_count is RuntimeReplaceable (Size(RegExpExtractAll(...))). Verify Comet runs it natively
-- and matches Spark.

-- MinSparkVersion: 3.5

-- fixture: a single string column stored as parquet
statement
CREATE TABLE test_regexp_count(s string) USING parquet

-- rows: a string containing both `Steven` and `Stephen`, a string without either, the empty
-- string, and NULL
statement
INSERT INTO test_regexp_count VALUES ('Steven Jones and Stephen Smith'), ('abcabcabc'), (''), (NULL)

-- column input with a literal pattern that uses a group with alternation
query
SELECT s, regexp_count(s, 'Ste(v|ph)en') FROM test_regexp_count

-- literal arguments: a pattern that occurs twice in the input, and one that does not occur
query
SELECT regexp_count('abcabc', 'abc'), regexp_count('hello', 'z')
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.

-- regexp_substr is RuntimeReplaceable (NullIf(RegExpExtract(..., 0), "")). Verify Comet runs it
-- natively and matches Spark, including the no-match case which returns NULL.

-- MinSparkVersion: 3.5

-- fixture: a single string column stored as parquet
statement
CREATE TABLE test_regexp_substr(s string) USING parquet

-- rows: a string containing both `Steven` and `Stephen`, a string without either, the empty
-- string, and NULL
statement
INSERT INTO test_regexp_substr VALUES ('Steven Jones and Stephen Smith'), ('no match here'), (''), (NULL)

-- column input with a literal pattern that uses a group with alternation
query
SELECT s, regexp_substr(s, 'Ste(v|ph)en') FROM test_regexp_substr

-- literal arguments: a pattern that occurs in the input, and one that does not occur
query
SELECT regexp_substr('user@spark.apache.org', '@[^.]*'), regexp_substr('hello', 'zzz')
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.

-- try_to_binary is RuntimeReplaceable (TryEval(ToBinary(...))). Verify Comet runs it natively and
-- matches Spark, including the invalid-hex case which returns NULL.

-- MinSparkVersion: 3.5

-- fixture: a single string column stored as parquet
statement
CREATE TABLE test_try_to_binary(s string) USING parquet

-- rows: two strings made only of hex digits, one that is not hex (`zz`), the empty string, and
-- NULL
statement
INSERT INTO test_try_to_binary VALUES ('616263'), ('48656c6c6f'), ('zz'), (''), (NULL)

-- column input decoded with the `hex` format
query
SELECT s, try_to_binary(s, 'hex') FROM test_try_to_binary

-- literal arguments: one hex string and one string that is not hex
query
SELECT try_to_binary('616263', 'hex'), try_to_binary('not-hex', 'hex')
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
-- Licensed to the Apache Software Foundation (ASF) under one
-- or more contributor license agreements. See the NOTICE file
-- distributed with this work for additional information
-- regarding copyright ownership. The ASF licenses this file
-- to you under the Apache License, Version 2.0 (the
-- "License"); you may not use this file except in compliance
-- with the License. You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing,
-- software distributed under the License is distributed on an
-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-- KIND, either express or implied. See the License for the
-- specific language governing permissions and limitations
-- under the License.

-- Routes try_to_number through the codegen dispatcher so behavior matches Spark exactly.
-- try_to_number returns NULL (instead of throwing) when the value does not match the format.

-- MinSparkVersion: 3.5

-- fixture: a single string column stored as parquet
statement
CREATE TABLE test_try_to_number(s string) USING parquet

-- rows: three plain decimal strings, a non-numeric string, a string with a `$` sign that the
-- format used below does not declare, and NULL
statement
INSERT INTO test_try_to_number VALUES ('454.00'), ('78.12'), ('0.00'), ('not-a-number'), ('$78.00'), (NULL)

-- column input parsed with a fixed-point format
query
SELECT s, try_to_number(s, '99999.99') FROM test_try_to_number

-- literal arguments: valid, and invalid (returns NULL rather than throwing)
query
SELECT try_to_number('454', '999'), try_to_number('abc', '999'), try_to_number('$12.00', '$99.99')
Loading