Skip to content
39 changes: 28 additions & 11 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import functools
import typing

import bigframes_vendored.ibis.expr.operations as vendored_ibis_ops
import ibis
import ibis.common.exceptions
import ibis.expr.datatypes as ibis_dtypes
Expand Down Expand Up @@ -737,7 +738,7 @@ def struct_field_op_impl(x: ibis_types.Value, op: ops.StructFieldOp):
return struct_value[name].name(name)


def numeric_to_datatime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
def numeric_to_datetime(x: ibis_types.Value, unit: str) -> ibis_types.TimestampValue:
if not isinstance(x, ibis_types.IntegerValue) and not isinstance(
x, ibis_types.FloatingValue
):
Expand Down Expand Up @@ -779,7 +780,7 @@ def astype_op_impl(x: ibis_types.Value, op: ops.AsTypeOp):
# with pandas converting int64[pyarrow] to timestamp[us][pyarrow],
# timestamp[us, tz=UTC][pyarrow], and time64[us][pyarrow].
unit = "us"
x_converted = numeric_to_datatime(x, unit)
x_converted = numeric_to_datetime(x, unit)
if to_type == ibis_dtypes.timestamp:
return x_converted.cast(ibis_dtypes.Timestamp())
elif to_type == ibis_dtypes.Timestamp(timezone="UTC"):
Expand Down Expand Up @@ -818,23 +819,39 @@ def isin_op_impl(x: ibis_types.Value, op: ops.IsInOp):
@scalar_op_compiler.register_unary_op(ops.ToDatetimeOp, pass_op=True)
def to_datetime_op_impl(x: ibis_types.Value, op: ops.ToDatetimeOp):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do worry this operation is getting a bit too complicated. Do the type rules reflect the fact that a DATETIME will be returned when utc=False?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will work on the refactor in a separate PR :-)

if x.type() == ibis_dtypes.str:
x = x.to_timestamp(op.format) if op.format else timestamp(x)
elif x.type() == ibis_dtypes.Timestamp(timezone="UTC"):
return vendored_ibis_ops.SafeCastToDatetime(x).to_expr()
else:
# Numerical inputs.
if op.format:
raise NotImplementedError(
f"Format parameter is not supported for Timestamp input types. {constants.FEEDBACK_LINK}"
)
return x
elif x.type() != ibis_dtypes.timestamp:
x = x.cast(ibis_dtypes.str).to_timestamp(op.format)
else:
# The default unit is set to "ns" (nanoseconds) for consistency
# with pandas, where "ns" is the default unit for datetime operations.
unit = op.unit or "ns"
x = numeric_to_datetime(x, unit)

return x.cast(ibis_dtypes.Timestamp(None))


@scalar_op_compiler.register_unary_op(ops.ToTimestampOp, pass_op=True)
def to_timestamp_op_impl(x: ibis_types.Value, op: ops.ToTimestampOp):
    """Compile ``ToTimestampOp`` into an expression cast to a UTC timestamp.

    String inputs are parsed with ``op.format`` when one is given and with the
    ``timestamp`` helper otherwise. Every other input goes through the numeric
    branch: with a format it is cast to string and parsed with that format;
    without one it is converted by ``numeric_to_datetime`` using ``op.unit``.

    Args:
        x: The input expression.
        op: The op carrying the optional ``format`` and ``unit`` settings.

    Returns:
        ``x`` converted and cast to ``Timestamp(timezone="UTC")``.
    """
    if x.type() == ibis_dtypes.str:
        x = (
            typing.cast(ibis_types.StringValue, x).to_timestamp(op.format)
            if op.format
            else timestamp(x)
        )
    else:
        # Numerical inputs.
        if op.format:
            x = x.cast(ibis_dtypes.str).to_timestamp(op.format)
        else:
            # The default unit is set to "ns" (nanoseconds) for consistency
            # with pandas, where "ns" is the default unit for datetime operations.
            unit = op.unit or "ns"
            # Convert exactly once, through the correctly spelled helper.
            x = numeric_to_datetime(x, unit)

    # ToTimestampOp has no ``utc`` field; its result is always UTC.
    return x.cast(ibis_dtypes.Timestamp(timezone="UTC"))


@scalar_op_compiler.register_unary_op(ops.RemoteFunctionOp, pass_op=True)
Expand Down
72 changes: 58 additions & 14 deletions bigframes/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import bigframes.constants as constants
import bigframes.dataframe
import bigframes.dtypes
import bigframes.operations as ops
import bigframes.series

Expand Down Expand Up @@ -51,25 +52,68 @@ def to_datetime(
f"to datetime is not implemented. {constants.FEEDBACK_LINK}"
)

arg = bigframes.series.Series(arg)
arg = bigframes.series.Series(arg)._cached()

if not utc and arg.dtype not in ("Int64", "Float64"): # type: ignore
raise NotImplementedError(
f"String and Timestamp requires utc=True. {constants.FEEDBACK_LINK}"
)

if format and unit and arg.dtype in ("Int64", "Float64"): # type: ignore
if format and unit and arg.dtype in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore
raise ValueError("cannot specify both format and unit")

if unit and arg.dtype not in ("Int64", "Float64"): # type: ignore
if unit and arg.dtype not in (bigframes.dtypes.INT_DTYPE, bigframes.dtypes.FLOAT_DTYPE): # type: ignore
raise NotImplementedError(
f"Unit parameter is not supported for non-numerical input types. {constants.FEEDBACK_LINK}"
)

return arg._apply_unary_op( # type: ignore
ops.ToDatetimeOp(
utc=utc,
format=format,
unit=unit,
if arg.dtype in (bigframes.dtypes.TIMESTAMP_DTYPE, bigframes.dtypes.DATETIME_DTYPE):
to_type = (
bigframes.dtypes.TIMESTAMP_DTYPE if utc else bigframes.dtypes.DATETIME_DTYPE
)
return arg._apply_unary_op(ops.AsTypeOp(to_type=to_type)) # type: ignore
if (not utc) and arg.dtype == bigframes.dtypes.STRING_DTYPE:
if format:
raise NotImplementedError(
f"Customized formats are not supported for string inputs when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}"
)

assert unit is None
as_datetime = arg._apply_unary_op( # type: ignore
ops.ToDatetimeOp(
format=format,
unit=unit,
)
)
failed_datetime_cast = arg.notnull() & as_datetime.isnull()
is_utc = arg._apply_unary_op(
ops.EndsWithOp(
pat=("Z", "-00:00", "+00:00", "-0000", "+0000", "-00", "+00")
)
)

# Cast to DATETIME shall succeed if all inputs are tz-naive.
if not failed_datetime_cast.any():
return as_datetime

if is_utc.all():
return arg._apply_unary_op( # type: ignore
ops.ToTimestampOp(
format=format,
unit=unit,
)
)

raise NotImplementedError(
f"Non-UTC string inputs are not supported when utc=False. Please set utc=True if possible. {constants.FEEDBACK_LINK}"
)
# If utc:
elif utc:
return arg._apply_unary_op( # type: ignore
ops.ToTimestampOp(
format=format,
unit=unit,
)
)
else:
return arg._apply_unary_op( # type: ignore
ops.ToDatetimeOp(
format=format,
unit=unit,
)
)
)
28 changes: 25 additions & 3 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import pandas as pd
import pyarrow as pa

import bigframes.dtypes
import bigframes.dtypes as dtypes
import bigframes.operations.type as op_typing

Expand Down Expand Up @@ -527,13 +528,34 @@ def output_type(self, *input_types):
@dataclasses.dataclass(frozen=True)
class ToDatetimeOp(UnaryOp):
    """Unary op converting string or numeric input to a tz-naive timestamp."""

    name: typing.ClassVar[str] = "to_datetime"
    # Optional format string forwarded to the compiler.
    format: typing.Optional[str] = None
    # Optional unit forwarded to the compiler.
    unit: typing.Optional[str] = None

    def output_type(self, *input_types):
        """Return the tz-naive microsecond timestamp dtype.

        Args:
            *input_types: Dtypes of the operands; only the first is inspected.

        Returns:
            ``pd.ArrowDtype(pa.timestamp("us", tz=None))``.

        Raises:
            TypeError: If the first input type is not float, int or string.
        """
        # Validate before returning so unsupported inputs are rejected.
        if input_types[0] not in (
            bigframes.dtypes.FLOAT_DTYPE,
            bigframes.dtypes.INT_DTYPE,
            bigframes.dtypes.STRING_DTYPE,
        ):
            raise TypeError("expected string or numeric input")
        return pd.ArrowDtype(pa.timestamp("us", tz=None))


@dataclasses.dataclass(frozen=True)
class ToTimestampOp(UnaryOp):
    """Unary op converting string or numeric input to a UTC timestamp."""

    name: typing.ClassVar[str] = "to_timestamp"
    format: typing.Optional[str] = None
    unit: typing.Optional[str] = None

    def output_type(self, *input_types):
        """Return the UTC microsecond timestamp dtype.

        Raises:
            TypeError: If the first input type is not float, int or string.
        """
        source_type = input_types[0]
        # Only numeric and string inputs can be converted.
        accepted_types = (
            bigframes.dtypes.FLOAT_DTYPE,
            bigframes.dtypes.INT_DTYPE,
            bigframes.dtypes.STRING_DTYPE,
        )
        if source_type in accepted_types:
            return pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
        raise TypeError("expected string or numeric input")


@dataclasses.dataclass(frozen=True)
Expand Down
96 changes: 96 additions & 0 deletions tests/system/small/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,3 +634,99 @@ def test_to_datetime_format_param(arg, utc, format):
pd.testing.assert_series_equal(
bf_result, pd_result, check_index_type=False, check_names=False
)


@pytest.mark.parametrize(
    ("arg", "utc", "output_in_utc", "format"),
    [
        # tz-naive strings with utc=False.
        (
            ["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"],
            False,
            False,
            None,
        ),
        # Strings carrying a UTC suffix with utc=False.
        (
            [
                "2008-12-25 05:30:00Z",
                "2008-12-25 05:30:00-00:00",
                "2008-12-25 05:30:00+00:00",
                "2008-12-25 05:30:00-0000",
                "2008-12-25 05:30:00+0000",
                "2008-12-25 05:30:00-00",
                "2008-12-25 05:30:00+00",
            ],
            False,
            True,
            None,
        ),
        # tz-naive strings with an explicit format and utc=True.
        (
            ["2014-08-15 08:15:12", "2011-08-15 08:15:12", "2015-08-15 08:15:12"],
            True,
            True,
            "%Y-%m-%d %H:%M:%S",
        ),
        # Strings with a non-UTC offset and utc=True.
        (
            [
                "2014-08-15 08:15:12+05:00",
                "2011-08-15 08:15:12+05:00",
                "2015-08-15 08:15:12+05:00",
            ],
            True,
            True,
            None,
        ),
    ],
)
def test_to_datetime_string_inputs(arg, utc, output_in_utc, format):
    """Compare ``bpd.to_datetime`` on string inputs with ``pd.to_datetime``."""
    bf_series = bpd.to_datetime(arg, utc=utc, format=format).to_pandas()
    if output_in_utc:
        target_dtype = "datetime64[ns, UTC]"
    else:
        target_dtype = "datetime64[ns]"
    bf_result = bf_series.astype(target_dtype)

    # Floor the pandas result to microseconds before comparing.
    pd_converted = pd.to_datetime(arg, utc=utc, format=format)
    pd_result = pd.Series(pd_converted).dt.floor("us")

    pd.testing.assert_series_equal(
        bf_result, pd_result, check_index_type=False, check_names=False
    )


@pytest.mark.parametrize(
    ("arg", "utc", "output_in_utc"),
    [
        # tz-naive datetimes with utc=False.
        (
            [datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)],
            False,
            False,
        ),
        # tz-naive datetimes with utc=True.
        (
            [datetime(2023, 1, 1, 12, 0), datetime(2023, 2, 1, 12, 0)],
            True,
            True,
        ),
        # UTC-aware datetimes with utc=True.
        (
            [
                datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
                datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
            ],
            True,
            True,
        ),
        # Mixed-timezone aware datetimes with utc=True.
        (
            [
                # A pytz zone must be attached with localize(); passing it as
                # tzinfo= to the datetime constructor picks up the zone's LMT
                # offset instead of the offset in effect on that date.
                pytz.timezone("America/New_York").localize(
                    datetime(2023, 1, 1, 12, 0)
                ),
                datetime(2023, 1, 1, 12, 0, tzinfo=pytz.timezone("UTC")),
            ],
            True,
            True,
        ),
    ],
)
def test_to_datetime_timestamp_inputs(arg, utc, output_in_utc):
    """Compare ``bpd.to_datetime`` on datetime inputs with ``pd.to_datetime``."""
    bf_result = (
        bpd.to_datetime(arg, utc=utc)
        .to_pandas()
        .astype("datetime64[ns, UTC]" if output_in_utc else "datetime64[ns]")
    )
    # Floor the pandas result to microseconds before comparing.
    pd_result = pd.Series(pd.to_datetime(arg, utc=utc)).dt.floor("us")
    pd.testing.assert_series_equal(
        bf_result, pd_result, check_index_type=False, check_names=False
    )
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def _generate_array(translator, op: vendored_ibis_ops.GenerateArray):
return f"GENERATE_ARRAY(0, {arg})"


def _safe_cast_to_datetime(translator, op: vendored_ibis_ops.SafeCastToDatetime):
    """Render the op as a ``SAFE_CAST(<arg> AS DATETIME)`` SQL fragment."""
    translated_arg = translator.translate(op.arg)
    return "SAFE_CAST({} AS DATETIME)".format(translated_arg)


def _quantile(translator, op: ibis_reductions.Quantile):
arg = translator.translate(op.arg)
quantile = translator.translate(op.quantile)
Expand All @@ -44,6 +49,7 @@ def _quantile(translator, op: ibis_reductions.Quantile):
vendored_ibis_ops.LastNonNullValue: _last_non_null_value, # type:ignore
vendored_ibis_ops.ToJsonString: _to_json_string, # type:ignore
vendored_ibis_ops.GenerateArray: _generate_array, # type:ignore
vendored_ibis_ops.SafeCastToDatetime: _safe_cast_to_datetime, # type:ignore
ibis_reductions.Quantile: _quantile, # type:ignore
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@

class GenerateArray(Unary):
    """Unary operation whose result dtype is an array of int64."""

    dtype = dt.Array(dt.int64)


class SafeCastToDatetime(Unary):
    """Unary operation whose result dtype is a timestamp without a timezone."""

    dtype = dt.Timestamp(timezone=None)