Source code for pyspark.sql.column

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# mypy: disable-error-code="empty-body"

import sys
from typing import (
    overload,
    Any,
    Callable,
    TYPE_CHECKING,
    Union,
)

from pyspark.sql.tvf_argument import TableValuedFunctionArgument
from pyspark.sql.utils import dispatch_col_method
from pyspark.sql.types import DataType
from pyspark.errors import PySparkValueError

if TYPE_CHECKING:
    from pyspark.sql._typing import LiteralType, DecimalLiteral, DateTimeLiteral
    from pyspark.sql.window import WindowSpec

__all__ = ["Column"]


[docs]class Column(TableValuedFunctionArgument):

    """
    A column in a DataFrame.

    .. versionadded:: 1.3.0

    .. versionchanged:: 3.4.0
        Supports Spark Connect.

    Examples
    --------
    Column instances can be created by

    >>> df = spark.createDataFrame(
    ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])

    Select a column out of a DataFrame
    >>> df.name
    Column<'name'>
    >>> df["name"]
    Column<'name'>

    Create from an expression

    >>> df.age + 1
    Column<...>
    >>> 1 / df.age
    Column<...>
    """

    # HACK ALERT!! this is to reduce the backward compatibility concern, and returns
    # Spark Classic Column by default. This is NOT an API, and NOT supposed to
    # be directly invoked. DO NOT use this constructor.
    def __new__(cls, *args: Any, **kwargs: Any) -> "Column":
        from pyspark.sql.classic.column import Column

        return Column.__new__(Column, *args, **kwargs)

    # arithmetic operators
    @dispatch_col_method
    def __neg__(self) -> "Column":
        ...

    @dispatch_col_method
    def __add__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __sub__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __mul__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __div__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __truediv__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __mod__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __radd__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rsub__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rmul__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rdiv__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rtruediv__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rmod__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __pow__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __rpow__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    # logistic operators
    @dispatch_col_method
    def __eq__(  # type: ignore[override]
        self,
        other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"],
    ) -> "Column":
        """binary function"""
        ...

    @dispatch_col_method
    def __ne__(  # type: ignore[override]
        self,
        other: Any,
    ) -> "Column":
        """binary function"""
        ...

    @dispatch_col_method
    def __lt__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __le__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __ge__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __gt__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

[docs]    @dispatch_col_method
    def eqNullSafe(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Equality test that is safe for null values.

        .. versionadded:: 2.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column`

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df1 = spark.createDataFrame([
        ...     Row(id=1, value='foo'),
        ...     Row(id=2, value=None)
        ... ])
        >>> df1.select(
        ...     df1['value'] == 'foo',
        ...     df1['value'].eqNullSafe('foo'),
        ...     df1['value'].eqNullSafe(None)
        ... ).show()
        +-------------+---------------+----------------+
        |(value = foo)|(value <=> foo)|(value <=> NULL)|
        +-------------+---------------+----------------+
        |         true|           true|           false|
        |         NULL|          false|            true|
        +-------------+---------------+----------------+
        >>> df2 = spark.createDataFrame([
        ...     Row(value = 'bar'),
        ...     Row(value = None)
        ... ])
        >>> df1.join(df2, df1["value"] == df2["value"]).count()
        0
        >>> df1.join(df2, df1["value"].eqNullSafe(df2["value"])).count()
        1
        >>> df2 = spark.createDataFrame([
        ...     Row(id=1, value=float('NaN')),
        ...     Row(id=2, value=42.0),
        ...     Row(id=3, value=None)
        ... ])
        >>> df2.select(
        ...     df2['value'].eqNullSafe(None),
        ...     df2['value'].eqNullSafe(float('NaN')),
        ...     df2['value'].eqNullSafe(42.0)
        ... ).show()
        +----------------+---------------+----------------+
        |(value <=> NULL)|(value <=> NaN)|(value <=> 42.0)|
        +----------------+---------------+----------------+
        |           false|           true|           false|
        |           false|          false|            true|
        |            true|          false|           false|
        +----------------+---------------+----------------+

        Notes
        -----
        Unlike Pandas, PySpark doesn't consider NaN values to be NULL. See the
        `NaN Semantics <https://spark.apache.org/docs/latest/sql-ref-datatypes.html#nan-semantics>`_
        for details.
        """
        ...

    # `and`, `or`, `not` cannot be overloaded in Python,
    # so use bitwise operators as boolean operators
    @dispatch_col_method
    def __and__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __or__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __invert__(self) -> "Column":
        ...

    @dispatch_col_method
    def __rand__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    @dispatch_col_method
    def __ror__(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        ...

    # container operators
    @dispatch_col_method
    def __contains__(self, item: Any) -> None:
        raise PySparkValueError(
            errorClass="CANNOT_APPLY_IN_FOR_COLUMN",
            messageParameters={},
        )

    # bitwise operators
[docs]    @dispatch_col_method
    def bitwiseOR(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise OR of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise or(|) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseOR(df.b)).collect()
        [Row((a | b)=235)]
        """
        ...

[docs]    @dispatch_col_method
    def bitwiseAND(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise AND of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise and(&) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseAND(df.b)).collect()
        [Row((a & b)=10)]
        """
        ...

[docs]    @dispatch_col_method
    def bitwiseXOR(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Compute bitwise XOR of this expression with another expression.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            a value or :class:`Column` to calculate bitwise xor(^) with
            this :class:`Column`.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(a=170, b=75)])
        >>> df.select(df.a.bitwiseXOR(df.b)).collect()
        [Row((a ^ b)=225)]
        """
        ...

[docs]    @dispatch_col_method
    def getItem(self, key: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        key
            a literal value, or a :class:`Column` expression.
            The result will only be true at a location if the item matches in the column.

             .. deprecated:: 3.0.0
                 :class:`Column` as a parameter is deprecated.

        Returns
        -------
        :class:`Column`
            Column representing the item(s) got at position out of a list or by key out of a dict.

        Examples
        --------
        >>> df = spark.createDataFrame([([1, 2], {"key": "value"})], ["l", "d"])
        >>> df.select(df.l.getItem(0), df.d.getItem("key")).show()
        +----+------+
        |l[0]|d[key]|
        +----+------+
        |   1| value|
        +----+------+
        """
        ...

[docs]    @dispatch_col_method
    def getField(self, name: Any) -> "Column":
        """
        An expression that gets a field by name in a :class:`StructType`.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        name
            a literal value, or a :class:`Column` expression.
            The result will only be true at a location if the field matches in the Column.

             .. deprecated:: 3.0.0
                 :class:`Column` as a parameter is deprecated.
        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column got by name.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(r=Row(a=1, b="b"))])
        >>> df.select(df.r.getField("b")).show()
        +---+
        |r.b|
        +---+
        |  b|
        +---+
        >>> df.select(df.r.a).show()
        +---+
        |r.a|
        +---+
        |  1|
        +---+
        """
        ...

[docs]    @dispatch_col_method
    def withField(self, fieldName: str, col: "Column") -> "Column":
        """
        An expression that adds/replaces a field in :class:`StructType` by name.

        .. versionadded:: 3.1.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        fieldName : str
            a literal value.
            The result will only be true at a location if any field matches in the Column.
        col : :class:`Column`
            A :class:`Column` expression for the column with `fieldName`.

        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column
            which field was added/replaced by fieldName.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> from pyspark.sql.functions import lit
        >>> df = spark.createDataFrame([Row(a=Row(b=1, c=2))])
        >>> df.withColumn('a', df['a'].withField('b', lit(3))).select('a.b').show()
        +---+
        |  b|
        +---+
        |  3|
        +---+
        >>> df.withColumn('a', df['a'].withField('d', lit(4))).select('a.d').show()
        +---+
        |  d|
        +---+
        |  4|
        +---+
        """
        ...

[docs]    @dispatch_col_method
    def dropFields(self, *fieldNames: str) -> "Column":
        """
        An expression that drops fields in :class:`StructType` by name.
        This is a no-op if the schema doesn't contain field name(s).

        .. versionadded:: 3.1.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        fieldNames : str
            Desired field names (collects all positional arguments passed)
            The result will drop at a location if any field matches in the Column.

        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column with field dropped by fieldName.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> from pyspark.sql.functions import col, lit
        >>> df = spark.createDataFrame([
        ...     Row(a=Row(b=1, c=2, d=3, e=Row(f=4, g=5, h=6)))])
        >>> df.withColumn('a', df['a'].dropFields('b')).show()
        +-----------------+
        |                a|
        +-----------------+
        |{2, 3, {4, 5, 6}}|
        +-----------------+

        >>> df.withColumn('a', df['a'].dropFields('b', 'c')).show()
        +--------------+
        |             a|
        +--------------+
        |{3, {4, 5, 6}}|
        +--------------+

        This method supports dropping multiple nested fields directly e.g.

        >>> df.withColumn("a", col("a").dropFields("e.g", "e.h")).show()
        +--------------+
        |             a|
        +--------------+
        |{1, 2, 3, {4}}|
        +--------------+

        However, if you are going to add/replace multiple nested fields,
        it is preferred to extract out the nested struct before
        adding/replacing multiple fields e.g.

        >>> df.select(col("a").withField(
        ...     "e", col("a.e").dropFields("g", "h")).alias("a")
        ... ).show()
        +--------------+
        |             a|
        +--------------+
        |{1, 2, 3, {4}}|
        +--------------+

        """
        ...

[docs]    @dispatch_col_method
    def __getattr__(self, item: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        item
            a literal value.

        Returns
        -------
        :class:`Column`
            Column representing the item got by key out of a dict.

        Examples
        --------
        >>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
        >>> df.select(df.d.key).show()
        +------+
        |d[key]|
        +------+
        | value|
        +------+
        """
        ...

[docs]    @dispatch_col_method
    def __getitem__(self, k: Any) -> "Column":
        """
        An expression that gets an item at position ``ordinal`` out of a list,
        or gets an item by key out of a dict.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        k
            a literal value, or a slice object without step.

        Returns
        -------
        :class:`Column`
            Column representing the item got by key out of a dict, or substrings sliced by
            the given slice object.

        Examples
        --------
        >>> df = spark.createDataFrame([('abcedfg', {"key": "value"})], ["l", "d"])
        >>> df.select(df.l[slice(1, 3)], df.d['key']).show()
        +---------------+------+
        |substr(l, 1, 3)|d[key]|
        +---------------+------+
        |            abc| value|
        +---------------+------+
        """
        ...

    @dispatch_col_method
    def __iter__(self) -> None:
        ...

    # string methods
[docs]    @dispatch_col_method
    def contains(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        Contains the other element. Returns a boolean :class:`Column` based on a string match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other
            string in line. A value as a literal or a :class:`Column`.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.contains('o')).collect()
        [Row(age=5, name='Bob')]
        """
        ...

[docs]    @dispatch_col_method
    def startswith(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        String starts with. Returns a boolean :class:`Column` based on a string match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`Column` or str
            string at start of line (do not use a regex `^`)

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.startswith('Al')).collect()
        [Row(age=2, name='Alice')]
        >>> df.filter(df.name.startswith('^Al')).collect()
        []
        """
        ...

[docs]    @dispatch_col_method
    def endswith(
        self, other: Union["Column", "LiteralType", "DecimalLiteral", "DateTimeLiteral"]
    ) -> "Column":
        """
        String ends with. Returns a boolean :class:`Column` based on a string match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : :class:`Column` or str
            string at end of line (do not use a regex `$`)

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.endswith('ice')).collect()
        [Row(age=2, name='Alice')]
        >>> df.filter(df.name.endswith('ice$')).collect()
        []
        """
        ...

[docs]    @dispatch_col_method
    def like(self: "Column", other: str) -> "Column":
        """
        SQL like expression. Returns a boolean :class:`Column` based on a SQL LIKE match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            a SQL LIKE pattern

        See Also
        --------
        pyspark.sql.Column.rlike

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by SQL LIKE pattern.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.like('Al%')).collect()
        [Row(age=2, name='Alice')]
        """
        ...

[docs]    @dispatch_col_method
    def rlike(self: "Column", other: str) -> "Column":
        """
        SQL RLIKE expression (LIKE with Regex). Returns a boolean :class:`Column` based on a regex
        match.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            an extended regex expression

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by extended regex expression.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.rlike('ice$')).collect()
        [Row(age=2, name='Alice')]
        """
        ...

[docs]    @dispatch_col_method
    def ilike(self: "Column", other: str) -> "Column":
        """
        SQL ILIKE expression (case insensitive LIKE). Returns a boolean :class:`Column`
        based on a case insensitive match.

        .. versionadded:: 3.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        other : str
            a SQL LIKE pattern

        See Also
        --------
        pyspark.sql.Column.rlike

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element
            in the Column is matched by SQL LIKE pattern.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.filter(df.name.ilike('%Ice')).collect()
        [Row(age=2, name='Alice')]
        """
        ...

    @overload
    def substr(self, startPos: int, length: int) -> "Column":
        ...

    @overload
    def substr(self, startPos: "Column", length: "Column") -> "Column":
        ...

[docs]    @dispatch_col_method
    def substr(self, startPos: Union[int, "Column"], length: Union[int, "Column"]) -> "Column":
        """
        Return a :class:`Column` which is a substring of the column.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        startPos : :class:`Column` or int
            start position
        length : :class:`Column` or int
            length of the substring

        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column is substr of origin Column.

        Examples
        --------

        Example 1. Using integers for the input arguments.

        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name.substr(1, 3).alias("col")).collect()
        [Row(col='Ali'), Row(col='Bob')]

        Example 2. Using columns for the input arguments.

        >>> df = spark.createDataFrame(
        ...      [(3, 4, "Alice"), (2, 3, "Bob")], ["sidx", "eidx", "name"])
        >>> df.select(df.name.substr(df.sidx, df.eidx).alias("col")).collect()
        [Row(col='ice'), Row(col='ob')]
        """
        ...

[docs]    @dispatch_col_method
    def isin(self, *cols: Any) -> "Column":
        """
        A boolean expression that is evaluated to true if the value of this
        expression is contained by the evaluated values of the arguments.

        .. versionadded:: 1.5.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        .. versionchanged:: 4.1.0
            Also takes a single :class:`DataFrame` to be used as IN subquery.

        Parameters
        ----------
        cols : Any
            The values to compare with the column values. The result will only be true at a location
            if any value matches in the Column.

        Returns
        -------
        :class:`Column`
            Column of booleans showing whether each element in the Column is contained in cols.

        Examples
        --------
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob"), (8, "Mike")], ["age", "name"])

        Example 1: Filter rows with names in the specified values

        >>> df[df.name.isin("Bob", "Mike")].orderBy("age").show()
        +---+----+
        |age|name|
        +---+----+
        |  5| Bob|
        |  8|Mike|
        +---+----+

        Example 2: Filter rows with ages in the specified list

        >>> df[df.age.isin([1, 2, 3])].show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        +---+-----+

        Example 3: Filter rows with names not in the specified values

        >>> df[~df.name.isin("Alice", "Bob")].show()
        +---+----+
        |age|name|
        +---+----+
        |  8|Mike|
        +---+----+

        Example 4: Take a :class:`DataFrame` and work as IN subquery

        >>> df.where(df.age.isin(spark.range(6))).orderBy("age").show()
        +---+-----+
        |age| name|
        +---+-----+
        |  2|Alice|
        |  5|  Bob|
        +---+-----+

        Example 5: Multiple values for IN subquery

        >>> from pyspark.sql.functions import lit, struct
        >>> df.where(struct(df.age, df.name).isin(spark.range(6).select("id", lit("Bob")))).show()
        +---+----+
        |age|name|
        +---+----+
        |  5| Bob|
        +---+----+
        """
        ...

    # order
[docs]    @dispatch_col_method
    def asc(self) -> "Column":
        """
        Returns a sort expression based on the ascending order of the column.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc()).collect()
        [Row(name='Alice'), Row(name='Tom')]
        """
        ...

[docs]    @dispatch_col_method
    def asc_nulls_first(self) -> "Column":
        """
        Returns a sort expression based on ascending order of the column, and null values
        return before non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc_nulls_first()).collect()
        [Row(name=None), Row(name='Alice'), Row(name='Tom')]

        """
        ...

[docs]    @dispatch_col_method
    def asc_nulls_last(self) -> "Column":
        """
        Returns a sort expression based on ascending order of the column, and null values
        appear after non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.asc_nulls_last()).collect()
        [Row(name='Alice'), Row(name='Tom'), Row(name=None)]

        """
        ...

[docs]    @dispatch_col_method
    def desc(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([('Tom', 80), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc()).collect()
        [Row(name='Tom'), Row(name='Alice')]
        """
        ...

[docs]    @dispatch_col_method
    def desc_nulls_first(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column, and null values
        appear before non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc_nulls_first()).collect()
        [Row(name=None), Row(name='Tom'), Row(name='Alice')]

        """
        ...

[docs]    @dispatch_col_method
    def desc_nulls_last(self) -> "Column":
        """
        Returns a sort expression based on the descending order of the column, and null values
        appear after non-null values.

        .. versionadded:: 2.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [('Tom', 80), (None, 60), ('Alice', None)], ["name", "height"])
        >>> df.select(df.name).orderBy(df.name.desc_nulls_last()).collect()
        [Row(name='Tom'), Row(name='Alice'), Row(name=None)]
        """
        ...

[docs]    @dispatch_col_method
    def isNull(self) -> "Column":
        """
        True if the current expression is null.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
        >>> df.filter(df.height.isNull()).collect()
        [Row(name='Alice', height=None)]
        """
        ...

[docs]    @dispatch_col_method
    def isNotNull(self) -> "Column":
        """
        True if the current expression is NOT null.

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame([Row(name='Tom', height=80), Row(name='Alice', height=None)])
        >>> df.filter(df.height.isNotNull()).collect()
        [Row(name='Tom', height=80)]
        """
        ...

[docs]    @dispatch_col_method
    def isNaN(self) -> "Column":
        """
        True if the current expression is NaN.

        .. versionadded:: 4.0.0

        Examples
        --------
        >>> from pyspark.sql import Row
        >>> df = spark.createDataFrame(
        ...     [Row(name='Tom', height=80.0), Row(name='Alice', height=float('nan'))])
        >>> df.filter(df.height.isNaN()).collect()
        [Row(name='Alice', height=nan)]
        """
        ...

[docs]    @dispatch_col_method
    def alias(self, *alias: str, **kwargs: Any) -> "Column":
        """
        Returns this column aliased with a new name or names (in the case of expressions that
        return more than one column, such as explode).

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        alias : str
            desired column names (collects all positional arguments passed)

        Other Parameters
        ----------------
        metadata: dict
            a dict of information to be stored in ``metadata`` attribute of the
            corresponding :class:`StructField <pyspark.sql.types.StructField>` (optional, keyword
            only argument)

            .. versionchanged:: 2.2.0
               Added optional ``metadata`` argument.

        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column is aliased with new name or names.

        Examples
        --------
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.age.alias("age2")).collect()
        [Row(age2=2), Row(age2=5)]
        >>> df.select(df.age.alias("age3", metadata={'max': 99})).schema['age3'].metadata['max']
        99
        """
        ...

[docs]    @dispatch_col_method
    def name(self, *alias: str, **kwargs: Any) -> "Column":
        """
        :func:`name` is an alias for :func:`alias`.

        .. versionadded:: 2.0.0
        """
        ...

[docs]    @dispatch_col_method
    def cast(self, dataType: Union[DataType, str]) -> "Column":
        """
        Casts the column into type ``dataType``.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        dataType : :class:`DataType` or str
            a DataType or Python string literal with a DDL-formatted string
            to use when parsing the column to the same type.

        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column is cast into new type.

        Examples
        --------
        >>> from pyspark.sql.types import StringType
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.age.cast("string").alias('ages')).collect()
        [Row(ages='2'), Row(ages='5')]
        >>> df.select(df.age.cast(StringType()).alias('ages')).collect()
        [Row(ages='2'), Row(ages='5')]
        """
        ...

[docs]    @dispatch_col_method
    def try_cast(self, dataType: Union[DataType, str]) -> "Column":
        """
        This is a special version of `cast` that performs the same operation, but returns a NULL
        value instead of raising an error if the invoke method throws exception.

        .. versionadded:: 4.0.0

        Parameters
        ----------
        dataType : :class:`DataType` or str
            a DataType or Python string literal with a DDL-formatted string
            to use when parsing the column to the same type.

        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column is cast into new type.

        Examples
        --------
        Example 1: Cast with a Datatype

        >>> from pyspark.sql.types import LongType
        >>> df = spark.createDataFrame(
        ...      [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
        >>> df.select(df.name.try_cast(LongType())).show()
        +----+
        |name|
        +----+
        | 123|
        |NULL|
        |NULL|
        +----+

        Example 2: Cast with a DDL string

        >>> df = spark.createDataFrame(
        ...      [(2, "123"), (5, "Bob"), (3, None)], ["age", "name"])
        >>> df.select(df.name.try_cast("double")).show()
        +-----+
        | name|
        +-----+
        |123.0|
        | NULL|
        | NULL|
        +-----+
        """
        ...

[docs]    @dispatch_col_method
    def astype(self, dataType: Union[DataType, str]) -> "Column":
        """
        :func:`astype` is an alias for :func:`cast`.

        .. versionadded:: 1.4.0
        """
        ...

[docs]    @dispatch_col_method
    def between(
        self,
        lowerBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
        upperBound: Union["Column", "LiteralType", "DateTimeLiteral", "DecimalLiteral"],
    ) -> "Column":
        """
        Check if the current column's values are between the specified lower and upper
        bounds, inclusive.

        .. versionadded:: 1.3.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        lowerBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
            The lower boundary value, inclusive.
        upperBound : :class:`Column`, int, float, string, bool, datetime, date or Decimal
            The upper boundary value, inclusive.

        Returns
        -------
        :class:`Column`
            A new column of boolean values indicating whether each element in the original
            column is within the specified range (inclusive).

        Examples
        --------
        Using between with integer values.

        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name, df.age.between(2, 4)).show()
        +-----+---------------------------+
        | name|((age >= 2) AND (age <= 4))|
        +-----+---------------------------+
        |Alice|                       true|
        |  Bob|                      false|
        +-----+---------------------------+

        Using between with string values.

        >>> df = spark.createDataFrame([("Alice", "A"), ("Bob", "B")], ["name", "initial"])
        >>> df.select(df.name, df.initial.between("A", "B")).show()
        +-----+-----------------------------------+
        | name|((initial >= A) AND (initial <= B))|
        +-----+-----------------------------------+
        |Alice|                               true|
        |  Bob|                               true|
        +-----+-----------------------------------+

        Using between with float values.

        >>> df = spark.createDataFrame(
        ...     [(2.5, "Alice"), (5.5, "Bob")], ["height", "name"])
        >>> df.select(df.name, df.height.between(2.0, 5.0)).show()
        +-----+-------------------------------------+
        | name|((height >= 2.0) AND (height <= 5.0))|
        +-----+-------------------------------------+
        |Alice|                                 true|
        |  Bob|                                false|
        +-----+-------------------------------------+

        Using between with date values.

        >>> import pyspark.sql.functions as sf
        >>> df = spark.createDataFrame(
        ...     [("Alice", "2023-01-01"), ("Bob", "2023-02-01")], ["name", "date"])
        >>> df = df.withColumn("date", sf.to_date(df.date))
        >>> df.select(df.name, df.date.between("2023-01-01", "2023-01-15")).show()
        +-----+-----------------------------------------------+
        | name|((date >= 2023-01-01) AND (date <= 2023-01-15))|
        +-----+-----------------------------------------------+
        |Alice|                                           true|
        |  Bob|                                          false|
        +-----+-----------------------------------------------+
        >>> from datetime import date
        >>> df.select(df.name, df.date.between(date(2023, 1, 1), date(2023, 1, 15))).show()
        +-----+-------------------------------------------------------------+
        | name|((date >= DATE '2023-01-01') AND (date <= DATE '2023-01-15'))|
        +-----+-------------------------------------------------------------+
        |Alice|                                                         true|
        |  Bob|                                                        false|
        +-----+-------------------------------------------------------------+

        Using between with timestamp values.

        >>> import pyspark.sql.functions as sf
        >>> df = spark.createDataFrame(
        ...     [("Alice", "2023-01-01 10:00:00"), ("Bob", "2023-02-01 10:00:00")],
        ...     schema=["name", "timestamp"])
        >>> df = df.withColumn("timestamp", sf.to_timestamp(df.timestamp))
        >>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01")).show()
        +-----+---------------------------------------------------------+
        | name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01))|
        +-----+---------------------------------------------------------+
        |Alice|                                                     true|
        |  Bob|                                                    false|
        +-----+---------------------------------------------------------+
        >>> df.select(df.name, df.timestamp.between("2023-01-01", "2023-02-01 12:00:00")).show()
        +-----+------------------------------------------------------------------+
        | name|((timestamp >= 2023-01-01) AND (timestamp <= 2023-02-01 12:00:00))|
        +-----+------------------------------------------------------------------+
        |Alice|                                                              true|
        |  Bob|                                                              true|
        +-----+------------------------------------------------------------------+
        """
        ...

[docs]    @dispatch_col_method
    def when(self, condition: "Column", value: Any) -> "Column":
        """
        Evaluates a list of conditions and returns one of multiple possible result expressions.
        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        condition : :class:`Column`
            a boolean :class:`Column` expression.
        value
            a literal value, or a :class:`Column` expression.

        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column is in conditions.

        Examples
        --------
        Example 1: Using :func:`when` with conditions and values to create a new Column

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> result = df.select(df.name, sf.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0))
        >>> result.show()
        +-----+------------------------------------------------------------+
        | name|CASE WHEN (age > 4) THEN 1 WHEN (age < 3) THEN -1 ELSE 0 END|
        +-----+------------------------------------------------------------+
        |Alice|                                                          -1|
        |  Bob|                                                           1|
        +-----+------------------------------------------------------------+

        Example 2: Chaining multiple :func:`when` conditions

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(1, "Alice"), (4, "Bob"), (6, "Charlie")], ["age", "name"])
        >>> result = df.select(
        ...     df.name,
        ...     sf.when(df.age < 3, "Young").when(df.age < 5, "Middle-aged").otherwise("Old")
        ... )
        >>> result.show()
        +-------+---------------------------------------------------------------------------+
        |   name|CASE WHEN (age < 3) THEN Young WHEN (age < 5) THEN Middle-aged ELSE Old END|
        +-------+---------------------------------------------------------------------------+
        |  Alice|                                                                      Young|
        |    Bob|                                                                Middle-aged|
        |Charlie|                                                                        Old|
        +-------+---------------------------------------------------------------------------+

        Example 3: Using literal values as conditions

        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> result = df.select(
        ...     df.name, sf.when(sf.lit(True), 1).otherwise(
        ...         sf.raise_error("unreachable")).alias("when"))
        >>> result.show()
        +-----+----+
        | name|when|
        +-----+----+
        |Alice|   1|
        |  Bob|   1|
        +-----+----+

        See Also
        --------
        pyspark.sql.functions.when
        """
        ...

[docs]    @dispatch_col_method
    def otherwise(self, value: Any) -> "Column":
        """
        Evaluates a list of conditions and returns one of multiple possible result expressions.
        If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        value
            a literal value, or a :class:`Column` expression.

        Returns
        -------
        :class:`Column`
            Column representing whether each element of Column is unmatched conditions.

        Examples
        --------
        >>> from pyspark.sql import functions as sf
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.select(df.name, sf.when(df.age > 3, 1).otherwise(0)).show()
        +-----+-------------------------------------+
        | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END|
        +-----+-------------------------------------+
        |Alice|                                    0|
        |  Bob|                                    1|
        +-----+-------------------------------------+

        See Also
        --------
        pyspark.sql.functions.when
        """
        ...

[docs]    @dispatch_col_method
    def over(self, window: "WindowSpec") -> "Column":
        """
        Define a windowing column.

        .. versionadded:: 1.4.0

        .. versionchanged:: 3.4.0
            Supports Spark Connect.

        Parameters
        ----------
        window : :class:`WindowSpec`

        Returns
        -------
        :class:`Column`

        Examples
        --------
        >>> from pyspark.sql import Window
        >>> window = (
        ...     Window.partitionBy("name")
        ...     .orderBy("age")
        ...     .rowsBetween(Window.unboundedPreceding, Window.currentRow)
        ... )
        >>> from pyspark.sql.functions import rank, min, desc
        >>> df = spark.createDataFrame(
        ...      [(2, "Alice"), (5, "Bob")], ["age", "name"])
        >>> df.withColumn(
        ...      "rank", rank().over(window)
        ... ).withColumn(
        ...      "min", min('age').over(window)
        ... ).sort(desc("age")).show()
        +---+-----+----+---+
        |age| name|rank|min|
        +---+-----+----+---+
        |  5|  Bob|   1|  5|
        |  2|Alice|   1|  2|
        +---+-----+----+---+
        """
        ...

[docs]    @dispatch_col_method
    def transform(self, f: Callable[["Column"], "Column"]) -> "Column":
        """
        Applies a transformation function to this column.

        This method allows you to apply a function that takes a Column and returns a Column,
        enabling method chaining and functional transformations.

        .. versionadded:: 4.1.0

        Parameters
        ----------
        f : callable
            A function that takes a :class:`Column` and returns a :class:`Column`.

        Returns
        -------
        :class:`Column`
            The result of applying the function to this column.

        Examples
        --------
        Example 1: Chain built-in functions

        >>> from pyspark.sql.functions import trim, upper
        >>> df = spark.createDataFrame([("  hello  ",), ("  world  ",)], ["text"])
        >>> df.select(df.text.transform(trim).transform(upper).alias("result")).show()
        +------+
        |result|
        +------+
        | HELLO|
        | WORLD|
        +------+

        Example 2: Use lambda functions

        >>> df = spark.createDataFrame([(10,), (20,), (30,)], ["value"])
        >>> df.select(
        ...     df.value.transform(lambda c: c + 5)
        ...     .transform(lambda c: c * 2)
        ...     .transform(lambda c: c - 10).alias("result")
        ... ).show()
        +------+
        |result|
        +------+
        |    20|
        |    40|
        |    60|
        +------+
        """
        ...

[docs]    @dispatch_col_method
    def outer(self) -> "Column":
        """
        Mark this column as an outer column if its expression refers to columns from an outer query.

        This is used to trigger lazy analysis of Spark Classic DataFrame, so that we can use it
        to build subquery expressions. Spark Connect DataFrame is always lazily analyzed and
        does not need to use this function.

        .. versionadded:: 4.0.0

        See Also
        --------
        pyspark.sql.dataframe.DataFrame.scalar
        pyspark.sql.dataframe.DataFrame.exists
        """
        ...

    @dispatch_col_method
    def __nonzero__(self) -> None:
        ...

    @dispatch_col_method
    def __bool__(self) -> None:
        ...

    @dispatch_col_method
    def __repr__(self) -> str:
        ...


def _test() -> None:
    import doctest
    from pyspark.sql import SparkSession
    import pyspark.sql.column

    globs = pyspark.sql.column.__dict__.copy()
    spark = SparkSession.builder.master("local[4]").appName("sql.column tests").getOrCreate()
    globs["spark"] = spark

    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.column,
        globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF,
    )
    spark.stop()
    if failure_count:
        sys.exit(-1)


if __name__ == "__main__":
    _test()