apache · JiaqiWang18 · Aug 23, 2025 · Aug 25, 2025 · Aug 25, 2025 · Aug 29, 2025
diff --git a/...ct/server/src/test/scala/org/apache/spark/sql/connect/pipelines/PythonPipelineSuite.scala b/...ct/server/src/test/scala/org/apache/spark/sql/connect/pipelines/PythonPipelineSuite.scala
@@ -30,7 +30,8 @@ import org.apache.spark.api.python.PythonUtils
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.TableIdentifier
 import org.apache.spark.sql.connect.service.SparkConnectService
-import org.apache.spark.sql.pipelines.graph.DataflowGraph
+import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog}
+import org.apache.spark.sql.pipelines.graph.{DataflowGraph, PipelineUpdateContextImpl}
 import org.apache.spark.sql.pipelines.utils.{EventVerificationTestHelpers, TestPipelineUpdateContextMixin}
 
 /**
@@ -434,6 +435,36 @@ class PythonPipelineSuite
         .map(_.identifier) == Seq(graphIdentifier("a"), graphIdentifier("something")))
   }
 
+  test("MV/ST with partition columns works") {
+    val graph = buildGraph("""
+           |from pyspark.sql.functions import col
+           |
+           |@dp.materialized_view(partition_cols = ["id_mod"])
+           |def mv():
+           |  return spark.range(5).withColumn("id_mod", col("id") % 2)
+           |
+           |@dp.table(partition_cols = ["id_mod"])
+           |def st():
+           |  return spark.readStream.table("mv")
+           |""".stripMargin)
+    // Run the pipeline built from the Python definitions above and await its completion.
+    val updateContext = new PipelineUpdateContextImpl(graph, eventCallback = _ => ())
+    updateContext.pipelineExecution.runPipeline()
+    updateContext.pipelineExecution.awaitCompletion()
+
+    // Look up both tables through the current catalog to inspect their partitioning.
+    val catalog = spark.sessionState.catalogManager.currentCatalog.asInstanceOf[TableCatalog]
+
+    Seq("mv", "st").foreach { tableName =>
+      val table = catalog.loadTable(Identifier.of(Array("default"), tableName))
+      assert(table.partitioning().map(_.references().head.fieldNames().head) === Array("id_mod"))
+      // Check the table contents: ids 0 to 4, each paired with id % 2.
+      val rows = spark.table(tableName).collect().map(r => (r.getLong(0), r.getLong(1))).toSet
+      val expected = (0 until 5).map(id => (id.toLong, (id % 2).toLong)).toSet
+      assert(rows == expected)
+    }
+  }
+
   test("create pipeline without table will throw RUN_EMPTY_PIPELINE exception") {
     checkError(
       exception = intercept[AnalysisException] {

diff --git a/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala b/sql/pipelines/src/main/scala/org/apache/spark/sql/pipelines/graph/FlowExecution.scala
@@ -257,6 +257,10 @@ class BatchTableWrite(
         }
         dataFrameWriter
           .mode("append")
+          // In "append" mode with saveAsTable, partition columns must be specified in the query
+          // because the format and options of the existing table are used, and the table could
+          // have been created with partition columns.
+          .partitionBy(destination.partitionCols.getOrElse(Seq.empty): _*)
           .saveAsTable(destination.identifier.unquotedString)
       }
     }

diff --git a/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/SqlPipelineSuite.scala b/sql/pipelines/src/test/scala/org/apache/spark/sql/pipelines/graph/SqlPipelineSuite.scala
@@ -18,6 +18,7 @@ package org.apache.spark.sql.pipelines.graph
 
 import org.apache.spark.sql.{AnalysisException, Row}
 import org.apache.spark.sql.catalyst.parser.ParseException
+import org.apache.spark.sql.connector.catalog.{Identifier, TableCatalog}
 import org.apache.spark.sql.pipelines.utils.{PipelineTest, TestGraphRegistrationContext}
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types.{LongType, StructType}
@@ -266,6 +267,44 @@ class SqlPipelineSuite extends PipelineTest with SharedSparkSession {
     )
   }
 
+  test("MV/ST with partition columns works") {
+    val unresolvedDataflowGraph = unresolvedDataflowGraphFromSql(
+      sqlText = """
+                  |CREATE MATERIALIZED VIEW mv
+                  |PARTITIONED BY (id_mod)
+                  |AS
+                  |SELECT
+                  |  id,
+                  |  id % 2 AS id_mod
+                  |FROM range(3);
+                  |
+                  |CREATE STREAMING TABLE st
+                  |PARTITIONED BY (id_mod)
+                  |AS
+                  |SELECT * FROM STREAM(mv);
+                  |""".stripMargin
+    )
+    startPipelineAndWaitForCompletion(unresolvedDataflowGraph)
+    val expected = Seq(
+      Row(0, 0),
+      Row(1, 1),
+      Row(2, 0)
+    )
+    val catalog = spark.sessionState.catalogManager.currentCatalog.asInstanceOf[TableCatalog]
+    // The materialized view and the streaming table must both pass the same checks.
+    Seq("mv", "st").foreach { tableName =>
+      // Check that the table's only partition column is id_mod.
+      val table = catalog.loadTable(Identifier.of(Array("test_db"), tableName))
+      assert(table.partitioning().map(_.references().head.fieldNames().head) === Array("id_mod"))
+
+      // Check that the table holds the (id, id_mod) rows listed in `expected`.
+      checkAnswer(
+        spark.sql(s"SELECT * FROM ${fullyQualifiedIdentifier(tableName)}"),
+        expected
+      )
+    }
+  }
+
   test("Exception is thrown when non-identity partition columns are used") {
     val graphRegistrationContext = new TestGraphRegistrationContext(spark)
     val sqlGraphRegistrationContext = new SqlGraphRegistrationContext(graphRegistrationContext)