Merged
51 changes: 51 additions & 0 deletions bigframes/core/blocks.py
@@ -35,6 +35,7 @@
import pyarrow as pa

import bigframes._config.sampling_options as sampling_options
import bigframes.constants
import bigframes.constants as constants
import bigframes.core as core
import bigframes.core.expression as ex
@@ -1542,6 +1543,10 @@ def melt(
var_names=typing.Sequence[typing.Hashable],
value_name: typing.Hashable = "value",
):
"""
Unpivot columns to produce longer, narrower dataframe.
Arguments correspond to pandas.melt arguments.
"""
# TODO: Implement col_level and ignore_index
unpivot_col_id = guid.generate_guid()
var_col_ids = tuple([guid.generate_guid() for _ in var_names])
@@ -1570,6 +1575,52 @@ def melt(
index_columns=[index_id],
)

def transpose(self) -> Block:
"""Transpose the block. Will fail if dtypes aren't coercible to a common type or too many rows"""
original_col_index = self.column_labels
original_row_index = self.index.to_pandas()
original_row_count = len(original_row_index)
if original_row_count > bigframes.constants.MAX_COLUMNS:
raise NotImplementedError(
f"Object has {original_row_count} rows and is too large to transpose."
)

# Add row numbers to both axes to disambiguate, clean them up later
block = self
numbered_block = block.with_column_labels(
utils.combine_indices(
block.column_labels, pd.Index(range(len(block.column_labels)))
)
)
numbered_block, offsets = numbered_block.promote_offsets()

stacked_block = numbered_block.melt(
id_vars=(offsets,),
var_names=(
*[name for name in original_col_index.names],
"col_offset",
),
value_vars=block.value_columns,
)
col_labels = stacked_block.value_columns[-2 - original_col_index.nlevels : -2]
col_offset = stacked_block.value_columns[-2] # disambiguator we created earlier
cell_values = stacked_block.value_columns[-1]
# Groupby source column
stacked_block = stacked_block.set_index(
[*col_labels, col_offset]
) # col index is now row index
result = stacked_block.pivot(
columns=[offsets],
values=[cell_values],
columns_unique_values=tuple(range(original_row_count)),
)
# Drop the offsets from both axes before returning
return (
result.with_column_labels(original_row_index)
.order_by([ordering.ascending_over(result.index_columns[-1])])
.drop_levels([result.index_columns[-1]])
)

def _create_stack_column(
self, col_label: typing.Tuple, stack_labels: typing.Sequence[typing.Tuple]
):
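The new `Block.transpose` above implements transpose as an unpivot (melt) into long form followed by a pivot back to wide form, with positional offsets added to both axes so that duplicate labels stay distinguishable. Below is a minimal pandas sketch of that melt-then-pivot strategy; it is simplified to assume unique row and column labels, and the helper name `transpose_via_melt_pivot` is illustrative rather than part of this PR.

```python
import pandas as pd


def transpose_via_melt_pivot(df: pd.DataFrame) -> pd.DataFrame:
    # Unpivot to long form: one row per (row_label, col_label, cell) triple.
    long_form = (
        df.rename_axis(index="row_label", columns="col_label")
        .reset_index()
        .melt(id_vars="row_label", var_name="col_label", value_name="cell")
    )
    # Pivot back with the axes swapped: the original columns become the index.
    wide = long_form.pivot(index="col_label", columns="row_label", values="cell")
    # Restore the original orderings, which the pivot does not preserve.
    return wide.reindex(index=df.columns, columns=df.index)


df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=["a", "b"])
print(transpose_via_melt_pivot(df))  # matches df.T up to the axis names
```

The real implementation cannot assume unique labels, which is why it numbers the column labels, promotes row offsets, melts and pivots on those offsets, and only then drops them and restores the original row index as the new column labels.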
7 changes: 7 additions & 0 deletions bigframes/dataframe.py
@@ -311,6 +311,13 @@ def bqclient(self) -> bigframes.Session:
def _session(self) -> bigframes.Session:
return self._get_block().expr.session

@property
def T(self) -> DataFrame:
return DataFrame(self._get_block().transpose())

def transpose(self) -> DataFrame:
return self.T

def __len__(self):
rows, _ = self.shape
return rows
23 changes: 23 additions & 0 deletions tests/system/small/test_dataframe.py
@@ -2465,6 +2465,29 @@ def test_df_describe(scalars_dfs):
).all()


def test_df_transpose():
Contributor: If it's a cheap test, we could add a check that we raise TypeError for mixing strs and floats.

Contributor Author: added

# Include some floats to ensure type coercion
values = [[0, 3.5, True], [1, 4.5, False], [2, 6.5, None]]
# Test complex case of both axes being multi-indices with non-unique elements
columns = pd.Index(["A", "B", "A"], dtype=pd.StringDtype(storage="pyarrow"))
columns_multi = pd.MultiIndex.from_arrays([columns, columns], names=["c1", "c2"])
index = pd.Index(["b", "a", "a"], dtype=pd.StringDtype(storage="pyarrow"))
rows_multi = pd.MultiIndex.from_arrays([index, index], names=["r1", "r2"])

pd_df = pandas.DataFrame(values, index=rows_multi, columns=columns_multi)
bf_df = dataframe.DataFrame(values, index=rows_multi, columns=columns_multi)

pd_result = pd_df.T
bf_result = bf_df.T.to_pandas()

pd.testing.assert_frame_equal(pd_result, bf_result, check_dtype=False)


def test_df_transpose_error():
with pytest.raises(TypeError, match="Cannot coerce.*to a common type."):
dataframe.DataFrame([[1, "hello"], [2, "world"]]).transpose()


@pytest.mark.parametrize(
("ordered"),
[
82 changes: 82 additions & 0 deletions third_party/bigframes_vendored/pandas/core/frame.py
@@ -93,6 +93,88 @@ def values(self) -> np.ndarray:
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

@property
def T(self) -> DataFrame:
"""
The transpose of the DataFrame.

All columns must be the same dtype (numerics can be coerced to a common supertype).

**Examples:**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None
>>> df = bpd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df
col1 col2
0 1 3
1 2 4
<BLANKLINE>
[2 rows x 2 columns]

>>> df.T
0 1
col1 1 2
col2 3 4
<BLANKLINE>
[2 rows x 2 columns]

Returns:
DataFrame: The transposed DataFrame.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def transpose(self) -> DataFrame:
"""
Transpose index and columns.

Reflect the DataFrame over its main diagonal by writing rows as columns
and vice-versa. The property :attr:`.T` is an accessor to the method
:meth:`transpose`.
Contributor: Can we add something here about the dtypes requirements? (Columns must be coerceable to a common type) (I know it's below but seems important enough to be up top.)

Contributor Author: added


All columns must be the same dtype (numerics can be coerced to a common supertype).

**Examples:**

**Square DataFrame with homogeneous dtype**

>>> import bigframes.pandas as bpd
>>> bpd.options.display.progress_bar = None

>>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
>>> df1 = bpd.DataFrame(data=d1)
>>> df1
col1 col2
0 1 3
1 2 4
<BLANKLINE>
[2 rows x 2 columns]

>>> df1_transposed = df1.T # or df1.transpose()
>>> df1_transposed
0 1
col1 1 2
col2 3 4
<BLANKLINE>
[2 rows x 2 columns]

When the dtype is homogeneous in the original DataFrame, we get a
transposed DataFrame with the same dtype:

>>> df1.dtypes
col1 Int64
col2 Int64
dtype: object
>>> df1_transposed.dtypes
0 Int64
1 Int64
dtype: object

Returns:
DataFrame: The transposed DataFrame.
"""
raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

def info(
self,
verbose: bool | None = None,
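The docstring above notes that numeric columns can be coerced to a common supertype. Here is a short pandas illustration of what that coercion looks like on transpose; bigframes is assumed to behave analogously with its nullable Int64 and Float64 dtypes, and this behaviour is not part of the PR's doctests.

```python
import pandas as pd

df = pd.DataFrame({"ints": [1, 2], "floats": [3.5, 4.5]})
print(df.dtypes)    # ints: int64, floats: float64
print(df.T.dtypes)  # both transposed columns are float64: each original row
                    # now holds an int and a float, so the values are coerced
                    # to the common numeric supertype
```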