def _do_grouping(self):
    """Build the per-chunk groupby objects for this dataframe.

    The frame is first sorted on the leading grouping key and split into
    delayed pieces; each piece is then grouped on all keys in ``self._by``.
    The grouping tasks are persisted, checked against the non-overlap
    assumption, and the groupby half of every ``(grouped, overlap_data)``
    pair is returned.

    Returns:
        A list with one entry per persisted grouping task, each created by
        applying ``delayed(operator.itemgetter(0))`` to that task.
    """
    # Step 1: order the frame by the leading key. Per the original author's
    # note, this sorts and shuffles the partitions.
    # NOTE(review): the method is spelled ``sort_value`` (singular) - confirm
    # this is the name the dataframe class actually exposes.
    leading_key = self._by[0]
    pieces = self._df.sort_value(leading_key).to_delayed()

    # Step 2: group every piece independently, carrying along the data that
    # the overlap check needs.
    @delayed
    def _groupby(df, by):
        grouped = df.groupby(by=by)
        return grouped, _extract_data_to_check_group_overlap(grouped, by)

    pending = [_groupby(piece, self._by) for piece in pieces]

    # Persist the grouping tasks so their work is not duplicated by the
    # two consumers below (the projection and the overlap check).
    persisted = persist(*pending)

    # Project the groupby object (element 0) out of each persisted pair.
    take_grouped = delayed(operator.itemgetter(0))
    outgroups = [take_grouped(pair) for pair in persisted]

    _check_group_non_overlap_assumption(persisted)
    return outgroups
# Stray non-code text (apparently web-page navigation labels):
#   "Comment list"
#   "Table of contents"