fix: disable checking for uint_8 and uint_16 if complex type readers are enabled #1376

Status: Open · wants to merge 9 commits into base: main
@@ -1352,6 +1352,11 @@ object CometSparkSessionExtensions extends Logging {
     org.apache.spark.SPARK_VERSION >= "4.0"
   }
 
+  def isComplexTypeReaderEnabled(conf: SQLConf): Boolean = {
+    CometConf.COMET_NATIVE_SCAN_IMPL.get(conf) == CometConf.SCAN_NATIVE_ICEBERG_COMPAT ||
+      CometConf.COMET_NATIVE_SCAN_IMPL.get(conf) == CometConf.SCAN_NATIVE_DATAFUSION
+  }
+

Member commented:

I find the naming confusing here. This method determines whether we are using native_datafusion or native_iceberg_compat (both of which use DataFusion's ParquetExec). There is no logic related to complex types.

Complex type support was a big motivation for adding these new scans, but it doesn't seem to make sense to refer to complex types in the changes in this PR.

This is just a nit, and we can rename the methods in a future PR.
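For context, the reviewer's point is that this predicate depends only on which native scan implementation is configured, not on anything complex-type specific. A minimal sketch of selecting one of the two qualifying scan modes (the config key and value strings are assumptions inferred from the CometConf constant names, not shown in this diff):

```scala
import org.apache.spark.sql.SparkSession

object ScanImplExample extends App {
  val spark = SparkSession.builder().master("local[1]").appName("scan-impl").getOrCreate()

  // Assumed key/values backing CometConf.COMET_NATIVE_SCAN_IMPL; with either of
  // these two values, isComplexTypeReaderEnabled(conf) would return true, since
  // both scans are implemented on top of DataFusion's ParquetExec.
  spark.conf.set("spark.comet.scan.impl", "native_datafusion")
  // spark.conf.set("spark.comet.scan.impl", "native_iceberg_compat")

  spark.stop()
}
```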

   /** Calculates required memory overhead in MB per executor process for Comet. */
   def getCometMemoryOverheadInMiB(sparkConf: SparkConf): Long = {
     // `spark.executor.memory` default value is 1g
@@ -40,7 +40,6 @@ trait DataTypeSupport {
         true
       case t: DataType if t.typeName == "timestamp_ntz" =>
         true
-        true
       case _ => false
   }

spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala: 21 additions & 0 deletions

@@ -125,6 +125,26 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }

test("uint data type support") {
Seq(true, false).foreach { dictionaryEnabled =>
Seq(Byte.MaxValue, Short.MaxValue).foreach { valueRanges =>
{
withTempDir { dir =>
val path = new Path(dir.toURI.toString, "testuint.parquet")
makeParquetFileAllTypes(path, dictionaryEnabled = dictionaryEnabled, valueRanges + 1)
withParquetTable(path.toString, "tbl") {
if (CometSparkSessionExtensions.isComplexTypeReaderEnabled(conf)) {
checkSparkAnswer("select _9, _10 FROM tbl order by _11")

@andygrove (Member) commented on Feb 7, 2025:
Do we already have logic to fall back to Spark when the complex type reader is enabled and when the query references uint Parquet fields?

Contributor (author) replied:
No, we don't, for two reasons. First, in the plan we get the schema as understood by Spark, so the signed int_8 and int_16 values are indistinguishable from the unsigned ones; as a result we fall back to Spark for both signed and unsigned integers. Second, too many unit tests fail because they check that the plan contains a Comet operator, and they would need to be modified.
I'm open to putting it back, though.

Member replied:
> As a result we fall back to Spark for both signed and unsigned integers.

Just 8 and 16 bit, or all integers? I'm fine with falling back for 8 and 16 bit for now, although it would be nice to have a config to override this (with the understanding that behavior is incorrect for unsigned integers).

Contributor (author) replied:
Just 8 and 16 bit.
I started with the fallback to Spark and a compat override. The reason I reverted it is that I couldn't see a way to achieve compatibility with Spark even after/if apache/arrow-rs#7040 is addressed.
Let me do as you suggest. Marking this as draft in the meantime.

+              } else {
+                checkSparkAnswerAndOperator("select _9, _10 FROM tbl order by _11")
+              }
+            }
+          }
+        }
+      }
+    }
+  }
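The fallback discussion above turns on where the unsigned-ness is even visible: per the author, by the time Comet sees the Spark plan, uint_8 and uint_16 columns are already represented as signed Spark types and are indistinguishable from their signed counterparts, so any detection would have to inspect the Parquet file schema itself. A hedged sketch of such a check using parquet-java's schema API (containsSmallUnsignedInt is a hypothetical helper, not something this PR adds):

```scala
import org.apache.parquet.schema.MessageType
import org.apache.parquet.schema.LogicalTypeAnnotation.IntLogicalTypeAnnotation

// Hypothetical helper: true if any leaf column of the Parquet schema is an
// unsigned 8- or 16-bit integer. The Spark-facing schema cannot answer this,
// because these columns have already been widened to signed Spark types.
def containsSmallUnsignedInt(schema: MessageType): Boolean = {
  val columns = schema.getColumns // java.util.List[ColumnDescriptor]
  (0 until columns.size).exists { i =>
    columns.get(i).getPrimitiveType.getLogicalTypeAnnotation match {
      case ann: IntLogicalTypeAnnotation => !ann.isSigned && ann.getBitWidth <= 16
      case _ => false
    }
  }
}
```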

test("null literals") {
val batchSize = 1000
Seq(true, false).foreach { dictionaryEnabled =>
@@ -142,6 +162,7 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
         checkSparkAnswerAndOperator(sqlString)
       }
     }
+
   }
 }

spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala: 117 additions & 52 deletions

@@ -429,65 +429,130 @@ abstract class CometTestBase
     makeParquetFileAllTypes(path, dictionaryEnabled, 0, n)
   }

-  def makeParquetFileAllTypes(
-      path: Path,
-      dictionaryEnabled: Boolean,
-      begin: Int,
-      end: Int,
-      pageSize: Int = 128,
-      randomSize: Int = 0): Unit = {
-    val schemaStr =
+  def getAllTypesParquetSchema: String = {

Member commented:

If we are renaming this method, I wonder if we should remove the AllTypes part, since it does not generate all types. Perhaps getPrimitiveTypesParquetSchema?

Contributor (author) replied:

done

+    if (CometSparkSessionExtensions.isComplexTypeReaderEnabled(conf)) {
+      // Comet complex type reader has different behavior for uint_8, uint_16 types.
+      // The issue stems from undefined behavior in the parquet spec and is tracked
+      // here: https://github.com/apache/parquet-java/issues/3142
+      // here: https://github.com/apache/arrow-rs/issues/7040
+      // and here: https://github.com/apache/datafusion-comet/issues/1348
       if (isSpark34Plus) {
"""
|message root {
| optional boolean _1;
| optional int32 _2(INT_8);
| optional int32 _3(INT_16);
| optional int32 _4;
| optional int64 _5;
| optional float _6;
| optional double _7;
| optional binary _8(UTF8);
| optional int32 _9(UINT_8);
| optional int32 _10(UINT_16);
| optional int32 _11(UINT_32);
| optional int64 _12(UINT_64);
| optional binary _13(ENUM);
| optional FIXED_LEN_BYTE_ARRAY(3) _14;
| optional int32 _15(DECIMAL(5, 2));
| optional int64 _16(DECIMAL(18, 10));
| optional FIXED_LEN_BYTE_ARRAY(16) _17(DECIMAL(38, 37));
| optional INT64 _18(TIMESTAMP(MILLIS,true));
| optional INT64 _19(TIMESTAMP(MICROS,true));
| optional INT32 _20(DATE);
|}
+          |message root {
+          | optional boolean _1;
+          | optional int32 _2(INT_8);
+          | optional int32 _3(INT_16);
+          | optional int32 _4;
+          | optional int64 _5;
+          | optional float _6;
+          | optional double _7;
+          | optional binary _8(UTF8);
+          | optional int32 _9(UINT_32);
+          | optional int32 _10(UINT_32);
+          | optional int32 _11(UINT_32);
+          | optional int64 _12(UINT_64);
+          | optional binary _13(ENUM);
+          | optional FIXED_LEN_BYTE_ARRAY(3) _14;
+          | optional int32 _15(DECIMAL(5, 2));
+          | optional int64 _16(DECIMAL(18, 10));
+          | optional FIXED_LEN_BYTE_ARRAY(16) _17(DECIMAL(38, 37));
+          | optional INT64 _18(TIMESTAMP(MILLIS,true));
+          | optional INT64 _19(TIMESTAMP(MICROS,true));
+          | optional INT32 _20(DATE);
+          |}
""".stripMargin
} else {
"""
-          |message root {
-          | optional boolean _1;
-          | optional int32 _2(INT_8);
-          | optional int32 _3(INT_16);
-          | optional int32 _4;
-          | optional int64 _5;
-          | optional float _6;
-          | optional double _7;
-          | optional binary _8(UTF8);
-          | optional int32 _9(UINT_8);
-          | optional int32 _10(UINT_16);
-          | optional int32 _11(UINT_32);
-          | optional int64 _12(UINT_64);
-          | optional binary _13(ENUM);
-          | optional binary _14(UTF8);
-          | optional int32 _15(DECIMAL(5, 2));
-          | optional int64 _16(DECIMAL(18, 10));
-          | optional FIXED_LEN_BYTE_ARRAY(16) _17(DECIMAL(38, 37));
-          | optional INT64 _18(TIMESTAMP(MILLIS,true));
-          | optional INT64 _19(TIMESTAMP(MICROS,true));
-          | optional INT32 _20(DATE);
-          |}
+          |message root {
+          | optional boolean _1;
+          | optional int32 _2(INT_8);
+          | optional int32 _3(INT_16);
+          | optional int32 _4;
+          | optional int64 _5;
+          | optional float _6;
+          | optional double _7;
+          | optional binary _8(UTF8);
+          | optional int32 _9(UINT_32);
+          | optional int32 _10(UINT_32);
+          | optional int32 _11(UINT_32);
+          | optional int64 _12(UINT_64);
+          | optional binary _13(ENUM);
+          | optional binary _14(UTF8);
+          | optional int32 _15(DECIMAL(5, 2));
+          | optional int64 _16(DECIMAL(18, 10));
+          | optional FIXED_LEN_BYTE_ARRAY(16) _17(DECIMAL(38, 37));
+          | optional INT64 _18(TIMESTAMP(MILLIS,true));
+          | optional INT64 _19(TIMESTAMP(MICROS,true));
+          | optional INT32 _20(DATE);
+          |}
""".stripMargin
}
+    } else {
+
+      if (isSpark34Plus) {
+        """
+          |message root {
+          | optional boolean _1;
+          | optional int32 _2(INT_8);
+          | optional int32 _3(INT_16);
+          | optional int32 _4;
+          | optional int64 _5;
+          | optional float _6;
+          | optional double _7;
+          | optional binary _8(UTF8);
+          | optional int32 _9(UINT_8);
+          | optional int32 _10(UINT_16);
+          | optional int32 _11(UINT_32);
+          | optional int64 _12(UINT_64);
+          | optional binary _13(ENUM);
+          | optional FIXED_LEN_BYTE_ARRAY(3) _14;
+          | optional int32 _15(DECIMAL(5, 2));
+          | optional int64 _16(DECIMAL(18, 10));
+          | optional FIXED_LEN_BYTE_ARRAY(16) _17(DECIMAL(38, 37));
+          | optional INT64 _18(TIMESTAMP(MILLIS,true));
+          | optional INT64 _19(TIMESTAMP(MICROS,true));
+          | optional INT32 _20(DATE);
+          |}
+        """.stripMargin
+      } else {
+        """
+          |message root {
+          | optional boolean _1;
+          | optional int32 _2(INT_8);
+          | optional int32 _3(INT_16);
+          | optional int32 _4;
+          | optional int64 _5;
+          | optional float _6;
+          | optional double _7;
+          | optional binary _8(UTF8);
+          | optional int32 _9(UINT_8);
+          | optional int32 _10(UINT_16);
+          | optional int32 _11(UINT_32);
+          | optional int64 _12(UINT_64);
+          | optional binary _13(ENUM);
+          | optional binary _14(UTF8);
+          | optional int32 _15(DECIMAL(5, 2));
+          | optional int64 _16(DECIMAL(18, 10));
+          | optional FIXED_LEN_BYTE_ARRAY(16) _17(DECIMAL(38, 37));
+          | optional INT64 _18(TIMESTAMP(MILLIS,true));
+          | optional INT64 _19(TIMESTAMP(MICROS,true));
+          | optional INT32 _20(DATE);
+          |}
+        """.stripMargin
+      }
+    }
+  }

+  def makeParquetFileAllTypes(
+      path: Path,
+      dictionaryEnabled: Boolean,
+      begin: Int,
+      end: Int,
+      pageSize: Int = 128,
+      randomSize: Int = 0): Unit = {
+    // alwaysIncludeUnsignedIntTypes means we include unsignedIntTypes in the test even if the
+    // reader does not support them
+    val schemaStr = getAllTypesParquetSchema

     val schema = MessageTypeParser.parseMessageType(schemaStr)
     val writer = createParquetWriter(