From ab3664065892c624a924786a40b54112a5ee5c19 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 12 Jun 2026 07:15:32 -0600 Subject: [PATCH] feat: wire mask and map (create_map) through codegen dispatch Routes Mask and CreateMap through the JVM codegen dispatcher so they run inside the Comet pipeline with Spark-exact results, with SQL file test coverage and a reference status update. --- docs/source/user-guide/latest/expressions.md | 4 +-- .../apache/comet/serde/QueryPlanSerde.scala | 6 ++-- .../scala/org/apache/comet/serde/maps.scala | 2 ++ .../org/apache/comet/serde/strings.scala | 4 ++- .../sql-tests/expressions/map/create_map.sql | 30 +++++++++++++++++ .../sql-tests/expressions/string/mask.sql | 32 +++++++++++++++++++ 6 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 spark/src/test/resources/sql-tests/expressions/map/create_map.sql create mode 100644 spark/src/test/resources/sql-tests/expressions/string/mask.sql diff --git a/docs/source/user-guide/latest/expressions.md b/docs/source/user-guide/latest/expressions.md index 899fe4b728..864962675f 100644 --- a/docs/source/user-guide/latest/expressions.md +++ b/docs/source/user-guide/latest/expressions.md @@ -375,7 +375,7 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the | Function | Status | Notes | | --- | --- | --- | | `element_at` | ✅ | MapType input falls back | -| `map` | 🔜 | Constructs a map | +| `map` | ✅ | Routed through the JVM codegen dispatcher | | `map_concat` | ✅ | | | `map_contains_key` | ✅ | | | `map_entries` | ✅ | | @@ -561,7 +561,7 @@ expression-level). The `outer` variants are wired but marked `Incompatible`; the | `lpad` | ✅ | | | `ltrim` | ✅ | | | `luhn_check` | ✅ | Native via `StaticInvoke` (tests: luhn_check.sql) | -| `mask` | 🔜 | Data masking | +| `mask` | ✅ | Routed through the JVM codegen dispatcher | | `octet_length` | ✅ | | | `overlay` | ✅ | | | `position` | ✅ | | diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala index 7ee74bb3ab..532ca08cdb 100644 --- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala +++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala @@ -182,7 +182,8 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[MapFilter] -> CometMapFilter, classOf[TransformKeys] -> CometTransformKeys, classOf[TransformValues] -> CometTransformValues, - classOf[MapZipWith] -> CometMapZipWith) + classOf[MapZipWith] -> CometMapZipWith, + classOf[CreateMap] -> CometCreateMap) base ++ sparkVersionSpecificMapExpressions } @@ -253,7 +254,8 @@ object QueryPlanSerde extends Logging with CometExprShim with CometTypeShim { classOf[StringLocate] -> CometStringLocate, classOf[UnBase64] -> CometUnBase64, classOf[ToCharacter] -> CometToCharacter, - classOf[ToNumber] -> CometToNumber) + classOf[ToNumber] -> CometToNumber, + classOf[Mask] -> CometMask) base ++ sparkVersionSpecificStringExpressions } diff --git a/spark/src/main/scala/org/apache/comet/serde/maps.scala b/spark/src/main/scala/org/apache/comet/serde/maps.scala index 42ee01abe6..e813b538dc 100644 --- a/spark/src/main/scala/org/apache/comet/serde/maps.scala +++ b/spark/src/main/scala/org/apache/comet/serde/maps.scala @@ -182,6 +182,8 @@ object CometStrToMap extends CometScalarFunction[StringToMap]("str_to_map") { } } +object CometCreateMap extends CometCodegenDispatch[CreateMap] + object CometMapFilter extends CometCodegenDispatch[MapFilter] object CometTransformKeys extends CometCodegenDispatch[TransformKeys] diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala b/spark/src/main/scala/org/apache/comet/serde/strings.scala index c4abe8ad4e..af304fa6b7 100644 --- a/spark/src/main/scala/org/apache/comet/serde/strings.scala +++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala @@ -21,7 +21,7 @@ package org.apache.comet.serde import java.util.Locale -import org.apache.spark.sql.catalyst.expressions.{Attribute, BitLength, Cast, Concat, ConcatWs, Elt, Expression, FindInSet, FormatNumber, FormatString, GetJsonObject, If, InitCap, IsNull, Left, Length, Levenshtein, Like, Literal, Lower, OctetLength, Overlay, RegExpExtract, RegExpExtractAll, RegExpInStr, RegExpReplace, Right, RLike, SoundEx, StringLocate, StringLPad, StringRepeat, StringReplace, StringRPad, StringSplit, StringTranslate, Substring, SubstringIndex, ToCharacter, ToNumber, UnBase64, Upper} +import org.apache.spark.sql.catalyst.expressions.{Attribute, BitLength, Cast, Concat, ConcatWs, Elt, Expression, FindInSet, FormatNumber, FormatString, GetJsonObject, If, InitCap, IsNull, Left, Length, Levenshtein, Like, Literal, Lower, Mask, OctetLength, Overlay, RegExpExtract, RegExpExtractAll, RegExpInStr, RegExpReplace, Right, RLike, SoundEx, StringLocate, StringLPad, StringRepeat, StringReplace, StringRPad, StringSplit, StringTranslate, Substring, SubstringIndex, ToCharacter, ToNumber, UnBase64, Upper} import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType} import org.apache.spark.unsafe.types.UTF8String @@ -602,3 +602,5 @@ object CometUnBase64 extends CometCodegenDispatch[UnBase64] object CometToCharacter extends CometCodegenDispatch[ToCharacter] object CometToNumber extends CometCodegenDispatch[ToNumber] + +object CometMask extends CometCodegenDispatch[Mask] diff --git a/spark/src/test/resources/sql-tests/expressions/map/create_map.sql b/spark/src/test/resources/sql-tests/expressions/map/create_map.sql new file mode 100644 index 0000000000..2dcfad3f6f --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/map/create_map.sql @@ -0,0 +1,30 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- map(...) constructor routed through the codegen dispatcher. Map keys may not be null in Spark. + +statement +CREATE TABLE test_create_map(k string, v int) USING parquet + +statement +INSERT INTO test_create_map VALUES ('a', 1), ('b', 2), ('c', NULL) + +query +SELECT map(k, v) FROM test_create_map + +query +SELECT map(1, 'a', 2, 'b'), map('x', array(1, 2), 'y', array(3)) diff --git a/spark/src/test/resources/sql-tests/expressions/string/mask.sql b/spark/src/test/resources/sql-tests/expressions/string/mask.sql new file mode 100644 index 0000000000..b24cf5726f --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/string/mask.sql @@ -0,0 +1,32 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- mask routed through the codegen dispatcher. + +-- MinSparkVersion: 3.5 + +statement +CREATE TABLE test_mask(s string) USING parquet + +statement +INSERT INTO test_mask VALUES ('AbCD123-$#'), ('Spark'), (''), (NULL) + +query +SELECT mask(s), s FROM test_mask + +query +SELECT mask('AbCD123-$#', 'Q', 'q', 'd', 'o'), mask('Spark', 'X')