Skip to content

Commit 8d81e24

Browse files
feat: add DataFrame from_dict and from_records methods (#244)
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly:
- [ ] Make sure to open an issue as a [bug/issue](https://github.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea
- [ ] Ensure the tests and linter pass
- [ ] Code coverage does not decrease (if any source code was changed)
- [ ] Appropriate docs were updated (if necessary)

Fixes #<issue_number_goes_here> 🦕
1 parent 0523a31 commit 8d81e24

File tree

3 files changed

+149
-0
lines changed

3 files changed

+149
-0
lines changed

bigframes/dataframe.py

+26
Original file line numberDiff line numberDiff line change
@@ -2381,6 +2381,32 @@ def _split(
23812381
blocks = self._block._split(ns=ns, fracs=fracs, random_state=random_state)
23822382
return [DataFrame(block) for block in blocks]
23832383

2384+
@classmethod
def from_dict(
    cls,
    data: dict,
    orient: str = "columns",
    dtype=None,
    columns=None,
) -> DataFrame:
    # Alternate constructor: pandas builds a local DataFrame from ``data`` and
    # the result is handed to ``cls(...)``.
    #
    # NOTE(review): no docstring is added here on purpose; the user-facing
    # documentation appears to live on the vendored pandas base class
    # (third_party/bigframes_vendored/pandas/core/frame.py) -- confirm that it
    # is inherited before adding one here.
    #
    # Optional arguments are forwarded by keyword so this wrapper does not
    # depend on the positional order of pandas' own signature.
    pandas_df = pandas.DataFrame.from_dict(  # type: ignore
        data, orient=orient, dtype=dtype, columns=columns
    )
    return cls(pandas_df)
2393+
2394+
@classmethod
2395+
def from_records(
2396+
cls,
2397+
data,
2398+
index=None,
2399+
exclude=None,
2400+
columns=None,
2401+
coerce_float: bool = False,
2402+
nrows: int | None = None,
2403+
) -> DataFrame:
2404+
return cls(
2405+
pandas.DataFrame.from_records(
2406+
data, index, exclude, columns, coerce_float, nrows
2407+
)
2408+
)
2409+
23842410
def to_csv(
23852411
self, path_or_buf: str, sep=",", *, header: bool = True, index: bool = True
23862412
) -> None:

tests/system/small/test_dataframe.py

+48
Original file line numberDiff line numberDiff line change
@@ -3309,6 +3309,54 @@ def test_df_duplicated(scalars_df_index, scalars_pandas_df_index, keep, subset):
33093309
pd.testing.assert_series_equal(pd_series, bf_series, check_dtype=False)
33103310

33113311

3312+
def test_df_from_dict_columns_orient():
    """``from_dict`` with ``orient="columns"`` should agree with pandas."""
    source = {"a": [1, 2], "b": [3.3, 2.4]}
    # Dtypes and index types are deliberately not compared.
    loose_checks = {"check_dtype": False, "check_index_type": False}

    bf_frame = dataframe.DataFrame.from_dict(source, orient="columns")
    actual = bf_frame.to_pandas()
    expected = pd.DataFrame.from_dict(source, orient="columns")

    assert_pandas_df_equal(expected, actual, **loose_checks)
3319+
3320+
3321+
def test_df_from_dict_index_orient():
    """``from_dict`` with ``orient="index"`` and explicit column labels should agree with pandas."""
    source = {"a": [1, 2], "b": [3.3, 2.4]}
    labels = ["col1", "col2"]
    # Dtypes and index types are deliberately not compared.
    loose_checks = {"check_dtype": False, "check_index_type": False}

    bf_frame = dataframe.DataFrame.from_dict(source, orient="index", columns=labels)
    actual = bf_frame.to_pandas()
    expected = pd.DataFrame.from_dict(source, orient="index", columns=labels)

    assert_pandas_df_equal(expected, actual, **loose_checks)
3330+
3331+
3332+
def test_df_from_dict_tight_orient():
    """``from_dict`` with ``orient="tight"`` should agree with pandas."""
    # "tight" layout: index, columns, data and the axis names in one dict.
    tight_payload = dict(
        index=[("i1", "i2"), ("i3", "i4")],
        columns=["col1", "col2"],
        data=[[1, 2.6], [3, 4.5]],
        index_names=["in1", "in2"],
        column_names=["column_axis"],
    )
    # Dtypes and index types are deliberately not compared.
    loose_checks = {"check_dtype": False, "check_index_type": False}

    bf_frame = dataframe.DataFrame.from_dict(tight_payload, orient="tight")
    actual = bf_frame.to_pandas()
    expected = pd.DataFrame.from_dict(tight_payload, orient="tight")

    assert_pandas_df_equal(expected, actual, **loose_checks)
3346+
3347+
3348+
def test_df_from_records():
    """``from_records`` on a tuple of row tuples should agree with pandas."""
    rows = ((1, "a"), (2.5, "b"), (3.3, "c"), (4.9, "d"))
    labels = ["c1", "c2"]
    # Dtypes and index types are deliberately not compared.
    loose_checks = {"check_dtype": False, "check_index_type": False}

    bf_frame = dataframe.DataFrame.from_records(rows, columns=labels)
    actual = bf_frame.to_pandas()
    expected = pd.DataFrame.from_records(rows, columns=labels)

    assert_pandas_df_equal(expected, actual, **loose_checks)
3358+
3359+
33123360
def test_df_to_dict(scalars_df_index, scalars_pandas_df_index):
33133361
unsupported = ["numeric_col"] # formatted differently
33143362
bf_result = scalars_df_index.drop(columns=unsupported).to_dict()

third_party/bigframes_vendored/pandas/core/frame.py

+75
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,81 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
196196

197197
# ----------------------------------------------------------------------
198198
# IO methods (to / from other formats)
199+
@classmethod
def from_dict(
    cls,
    data: dict,
    orient: str = "columns",
    dtype=None,
    columns=None,
) -> DataFrame:
    """
    Construct DataFrame from dict of array-like or dicts.

    Creates DataFrame object from dictionary by columns or by index
    allowing dtype specification.

    Args:
        data (dict):
            Of the form ``{field : array-like}`` or ``{field : dict}``.
        orient ({'columns', 'index', 'tight'}, default 'columns'):
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise, if the keys should be rows, pass 'index'.
            If 'tight', assume a dict with keys ``['index', 'columns',
            'data', 'index_names', 'column_names']``.
        dtype (dtype, default None):
            Data type to force after DataFrame construction, otherwise infer.
        columns (list, default None):
            Column labels to use when ``orient='index'``.

    Returns:
        DataFrame

    Raises:
        ValueError:
            If ``columns`` is used with ``orient='columns'`` or
            ``orient='tight'``.
    """
    # Abstract placeholder: this vendored definition only carries the
    # signature and documentation and always raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
232+
233+
@classmethod
def from_records(
    cls,
    data,
    index=None,
    exclude=None,
    columns=None,
    coerce_float: bool = False,
    nrows: int | None = None,
) -> DataFrame:
    """
    Convert structured or record ndarray to DataFrame.

    Builds a DataFrame from a structured ndarray, a sequence of tuples or
    dicts, or another DataFrame.

    Args:
        data (structured ndarray, sequence of tuples or dicts):
            The structured input data.
        index (str, list of fields, array-like):
            Field of the array to use as the index, or alternatively a
            specific set of input labels to use.
        exclude (sequence, default None):
            Columns or fields to leave out.
        columns (sequence, default None):
            Column names to use. When the passed data carry no names, this
            argument supplies the column names. Otherwise it sets the order
            of the columns in the result (any names not found in the data
            become all-NA columns).
        coerce_float (bool, default False):
            Try to convert values of non-string, non-numeric objects (such
            as decimal.Decimal) to floating point; useful for SQL result
            sets.
        nrows (int, default None):
            Number of rows to read when ``data`` is an iterator.

    Returns:
        DataFrame
    """
    # Abstract placeholder: this vendored definition only carries the
    # signature and documentation and always raises.
    raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
273+
199274
def to_numpy(self, dtype=None, copy=False, na_value=None, **kwargs) -> np.ndarray:
200275
"""
201276
Convert the DataFrame to a NumPy array.

0 commit comments

Comments
 (0)