
Commit cdbc302

[SPARK-28226][PYTHON] Document Pandas UDF mapInPandas
## What changes were proposed in this pull request?

This PR proposes to document `MAP_ITER` with `mapInPandas`.

## How was this patch tested?

Manually checked the documentation.

![Screen Shot 2019-07-05 at 1 52 30 PM](https://user-images.githubusercontent.com/6477701/60698812-26cf2d80-9f2c-11e9-8295-9c00c28f5569.png)

![Screen Shot 2019-07-05 at 1 48 53 PM](https://user-images.githubusercontent.com/6477701/60698710-ac061280-9f2b-11e9-8521-a4f361207e06.png)

Closes apache#25025 from HyukjinKwon/SPARK-28226.

Authored-by: HyukjinKwon <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
1 parent 4ad0c33 commit cdbc302

File tree: 4 files changed, +89 −11 lines

### docs/sql-pyspark-pandas-with-arrow.md (+23)
@@ -155,6 +155,29 @@ The following example shows how to use this type of UDF to compute mean with gro
 
 For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf)
 
+
+### Map Iterator
+
+Map iterator Pandas UDFs are used to transform data with an iterator of batches. Map iterator
+Pandas UDFs can be used with
+[`pyspark.sql.DataFrame.mapInPandas`](api/python/pyspark.sql.html#pyspark.sql.DataFrame.mapInPandas).
+It defines a map function that transforms an iterator of `pandas.DataFrame`s into another.
+
+In contrast to the scalar Pandas UDF, it can return output of arbitrary length. It maps an iterator of `pandas.DataFrame`s
+that represents the current `DataFrame`, using the map iterator UDF, and returns the result as a `DataFrame`.
+
+The following example shows how to create map iterator Pandas UDFs:
+
+<div class="codetabs">
+<div data-lang="python" markdown="1">
+{% include_example map_iter_pandas_udf python/sql/arrow.py %}
+</div>
+</div>
+
+For detailed usage, please see [`pyspark.sql.functions.pandas_udf`](api/python/pyspark.sql.html#pyspark.sql.functions.pandas_udf) and
+[`pyspark.sql.DataFrame.mapInPandas`](api/python/pyspark.sql.html#pyspark.sql.DataFrame.mapInPandas).
+
 ## Usage Notes
 
 ### Supported SQL Types
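The section above leans on the claim that a map iterator UDF may return output of a length different from its input. A minimal sketch of that point, assuming a running SparkSession named `spark` with PyArrow available and the `MAP_ITER` API added by this commit; the three-row input and the `adults_only` name are illustrative, not part of the patch:

```python
from pyspark.sql.functions import pandas_udf, PandasUDFType

df = spark.createDataFrame([(1, 21), (2, 30), (3, 40)], ("id", "age"))

@pandas_udf(df.schema, PandasUDFType.MAP_ITER)
def adults_only(batch_iter):
    # Each yielded frame may hold fewer rows than the batch it came from,
    # so the output length need not match the input length.
    for pdf in batch_iter:
        yield pdf[pdf.age >= 30]

# Only the rows with age >= 30 (ids 2 and 3) survive.
df.mapInPandas(adults_only).show()
```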

### examples/src/main/python/sql/arrow.py (+28)
@@ -236,6 +236,28 @@ def mean_udf(v):
     # $example off:grouped_agg_pandas_udf$
 
 
+def map_iter_pandas_udf_example(spark):
+    # $example on:map_iter_pandas_udf$
+    import pandas as pd
+
+    from pyspark.sql.functions import pandas_udf, PandasUDFType
+
+    df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
+
+    @pandas_udf(df.schema, PandasUDFType.MAP_ITER)
+    def filter_func(batch_iter):
+        for pdf in batch_iter:
+            yield pdf[pdf.id == 1]
+
+    df.mapInPandas(filter_func).show()
+    # +---+---+
+    # | id|age|
+    # +---+---+
+    # |  1| 21|
+    # +---+---+
+    # $example off:map_iter_pandas_udf$
+
+
 if __name__ == "__main__":
     spark = SparkSession \
         .builder \
@@ -246,7 +268,13 @@ def mean_udf(v):
     dataframe_with_arrow_example(spark)
     print("Running pandas_udf scalar example")
     scalar_pandas_udf_example(spark)
+    print("Running pandas_udf scalar iterator example")
+    scalar_iter_pandas_udf_example(spark)
     print("Running pandas_udf grouped map example")
     grouped_map_pandas_udf_example(spark)
+    print("Running pandas_udf grouped agg example")
+    grouped_agg_pandas_udf_example(spark)
+    print("Running pandas_udf map iterator example")
+    map_iter_pandas_udf_example(spark)
 
     spark.stop()

### python/pyspark/sql/dataframe.py (+10 −10)
@@ -2195,25 +2195,25 @@ def toPandas(self):
 
     def mapInPandas(self, udf):
         """
-        Maps each partition of the current :class:`DataFrame` using a pandas udf and returns
-        the result as a `DataFrame`.
+        Maps an iterator of batches in the current :class:`DataFrame` using a Pandas user-defined
+        function and returns the result as a :class:`DataFrame`.
 
-        The user-defined function should take an iterator of `pandas.DataFrame`s and return another
-        iterator of `pandas.DataFrame`s. For each partition, all columns are passed together as an
-        iterator of `pandas.DataFrame`s to the user-function and the returned iterator of
-        `pandas.DataFrame`s are combined as a :class:`DataFrame`.
+        The user-defined function should take an iterator of `pandas.DataFrame`\\s and return
+        another iterator of `pandas.DataFrame`\\s. All columns are passed
+        together as an iterator of `pandas.DataFrame`\\s to the user-defined function and the
+        `pandas.DataFrame`\\s in the returned iterator are combined as a :class:`DataFrame`.
         Each `pandas.DataFrame` size can be controlled by
         `spark.sql.execution.arrow.maxRecordsPerBatch`.
-        Its schema must match the returnType of the pandas udf.
+        Its schema must match the returnType of the Pandas user-defined function.
 
         :param udf: A function object returned by :meth:`pyspark.sql.functions.pandas_udf`
 
         >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
         >>> df = spark.createDataFrame([(1, 21), (2, 30)],
         ...                            ("id", "age"))  # doctest: +SKIP
-        >>> @pandas_udf(df.schema, PandasUDFType.SCALAR_ITER)  # doctest: +SKIP
-        ... def filter_func(iterator):
-        ...     for pdf in iterator:
+        >>> @pandas_udf(df.schema, PandasUDFType.MAP_ITER)  # doctest: +SKIP
+        ... def filter_func(batch_iter):
+        ...     for pdf in batch_iter:
         ...         yield pdf[pdf.id == 1]
         >>> df.mapInPandas(filter_func).show()  # doctest: +SKIP
         +---+---+
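The rewritten docstring says each `pandas.DataFrame` handed to the UDF is sized by `spark.sql.execution.arrow.maxRecordsPerBatch`. A hedged sketch of that knob, assuming a running SparkSession named `spark`; the cap of 2 and the six-row input are illustrative:

```python
from pyspark.sql.functions import pandas_udf, PandasUDFType

# Cap each Arrow batch, and therefore each pandas.DataFrame the UDF sees,
# at two rows per partition.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "2")

df = spark.range(6)  # a single "id" column of longs

@pandas_udf("id long", PandasUDFType.MAP_ITER)
def check_batch_size(batch_iter):
    for pdf in batch_iter:
        assert len(pdf) <= 2  # holds because of the config set above
        yield pdf

df.mapInPandas(check_batch_size).show()
```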

### python/pyspark/sql/functions.py (+28 −1)
@@ -2915,7 +2915,7 @@ def pandas_udf(f=None, returnType=None, functionType=None):
 
        :class:`MapType`, nested :class:`StructType` are currently not supported as output types.
 
-       Scalar UDFs are used with :meth:`pyspark.sql.DataFrame.withColumn` and
+       Scalar UDFs can be used with :meth:`pyspark.sql.DataFrame.withColumn` and
        :meth:`pyspark.sql.DataFrame.select`.
 
        >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
@@ -3191,6 +3191,33 @@ def pandas_udf(f=None, returnType=None, functionType=None):
 
        .. seealso:: :meth:`pyspark.sql.GroupedData.agg` and :class:`pyspark.sql.Window`
 
+    5. MAP_ITER
+
+       A map iterator Pandas UDF is used to transform data with an iterator of batches.
+       It can be used with :meth:`pyspark.sql.DataFrame.mapInPandas`.
+
+       In contrast to the scalar Pandas UDF, it can return output of arbitrary length.
+       It maps an iterator of batches in the current :class:`DataFrame` using a Pandas
+       user-defined function and returns the result as a :class:`DataFrame`.
+
+       The user-defined function should take an iterator of `pandas.DataFrame`\\s and return
+       another iterator of `pandas.DataFrame`\\s. All columns are passed together as an
+       iterator of `pandas.DataFrame`\\s to the user-defined function and the
+       `pandas.DataFrame`\\s in the returned iterator are combined as a :class:`DataFrame`.
+
+       >>> df = spark.createDataFrame([(1, 21), (2, 30)],
+       ...                            ("id", "age"))  # doctest: +SKIP
+       >>> @pandas_udf(df.schema, PandasUDFType.MAP_ITER)  # doctest: +SKIP
+       ... def filter_func(batch_iter):
+       ...     for pdf in batch_iter:
+       ...         yield pdf[pdf.id == 1]
+       >>> df.mapInPandas(filter_func).show()  # doctest: +SKIP
+       +---+---+
+       | id|age|
+       +---+---+
+       |  1| 21|
+       +---+---+
+
    .. note:: The user-defined functions are considered deterministic by default. Due to
        optimization, duplicate invocations may be eliminated or the function may even be invoked
        more times than it is present in the query. If your function is not deterministic, call
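The determinism note closing this hunk is cut off by the diff context; the full docstring goes on to recommend calling `asNondeterministic` on the user-defined function. A hedged sketch of that pattern, assuming a running SparkSession named `spark`; the `jitter` UDF is a made-up illustration:

```python
import random

from pyspark.sql.functions import pandas_udf, PandasUDFType

@pandas_udf("double", PandasUDFType.SCALAR)
def jitter(v):
    # Adds fresh random noise on every invocation, so repeated calls
    # over the same input can disagree.
    return v + random.random()

# Tell the optimizer not to eliminate or duplicate invocations.
jitter = jitter.asNondeterministic()

df = spark.createDataFrame([(1.0,), (2.0,)], ("x",))
df.select(jitter("x")).show()
```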
