admin管理员组文章数量:1313925
I am trying to create a rolling rank column for a float column. However the output is not as expected. Below I have an example:
data = {
'time': pd.date_range(start='2025-01-01', periods=5, freq='H'),
'zone': ['A'] * 5,
'price': [1.0, 1.5, 1.7, 1.9, 2.0],
}
dummy_df = pd.DataFrame(data)
class RollingRank:
def __init__(self, rank_col: str, group_by_col: str, window_length: int, time_col: str):
self.rank_col = rank_col
self.time_col = time_col
self.group_by_col = group_by_col
self.window_length = window_length
def fit_transform(self, df):
df = df.sort_values(by=[self.group_by_col, self.time_col])
df.set_index(self.time_col, inplace=True)
dfg = (
df.groupby(self.group_by_col)
.rolling(window=f"{self.window_length}H", center=True, closed="both")
.rank(method="min")
)
df.loc[:, f"{self.rank_col}_rank"] = dfg[self.rank_col].values
return df
df_rank = RollingRank(rank_col="price", group_by_col="zone", window_length=3, time_col="time").fit_transform(dummy_df)
As an output I get the rank [2.0, 3.0, 3.0, 3.0, 3.0]
which does not make sense with center=True
and closed="both"
.
As an easy example for the middel row with the timestamp 2025-01-01 02:00:00
I would expect the rank to be 2 (and not 3 as in the output) as the values used for ranking would be [1.5, 1.7, 1.9]
.
Any ideas of what I am doing wrong?
I am trying to create a rolling rank column for a float column. However the output is not as expected. Below I have an example:
data = {
'time': pd.date_range(start='2025-01-01', periods=5, freq='H'),
'zone': ['A'] * 5,
'price': [1.0, 1.5, 1.7, 1.9, 2.0],
}
dummy_df = pd.DataFrame(data)
class RollingRank:
def __init__(self, rank_col: str, group_by_col: str, window_length: int, time_col: str):
self.rank_col = rank_col
self.time_col = time_col
self.group_by_col = group_by_col
self.window_length = window_length
def fit_transform(self, df):
df = df.sort_values(by=[self.group_by_col, self.time_col])
df.set_index(self.time_col, inplace=True)
dfg = (
df.groupby(self.group_by_col)
.rolling(window=f"{self.window_length}H", center=True, closed="both")
.rank(method="min")
)
df.loc[:, f"{self.rank_col}_rank"] = dfg[self.rank_col].values
return df
df_rank = RollingRank(rank_col="price", group_by_col="zone", window_length=3, time_col="time").fit_transform(dummy_df)
As an output I get the rank [2.0, 3.0, 3.0, 3.0, 3.0]
which does not make sense with center=True
and closed="both"
.
As an easy example for the middel row with the timestamp 2025-01-01 02:00:00
I would expect the rank to be 2 (and not 3 as in the output) as the values used for ranking would be [1.5, 1.7, 1.9]
.
Any ideas of what I am doing wrong?
Share Improve this question asked Jan 31 at 14:53 andKaaeandKaae 3235 silver badges20 bronze badges 1- 2025-01-01 00:00:00 A 1.0 NaN, 2025-01-01 01:00:00 A 1.5 3.0, 2025-01-01 02:00:00 A 1.7 4.0, 2025-01-01 03:00:00 A 1.9 4.0, 2025-01-01 04:00:00 A 2.0 3.0 , Would you like to get this result? – Subir Chowdhury Commented Jan 31 at 15:19
1 Answer
Reset to default 0Cf. this post, which notes the same problem with plain df.rolling
. I would suggest creating an iterator with the timestamps using iter
and selecting the correct timestamp for each window rank via next
inside rolling.apply
. Basically:
# adding another group, with prices reversed
dummy_df = pd.concat([dummy_df, dummy_df.assign(
zone='B', price=dummy_df['price'].values[::-1]
)], ignore_index=True)
gr_by = dummy_df.set_index('time').groupby('zone')
ts_iter = iter(dummy_df.index.tolist())
s = (gr_by.rolling(window='3h', center=True, closed='both')['price']
.apply(lambda x: x.rank(method='min')[next(ts_iter)])
.rename('price_rank')
)
Output:
zone time
A 2025-01-01 00:00:00 1.0
2025-01-01 01:00:00 2.0
2025-01-01 02:00:00 2.0
2025-01-01 03:00:00 2.0
2025-01-01 04:00:00 2.0
B 2025-01-01 00:00:00 2.0
2025-01-01 01:00:00 2.0
2025-01-01 02:00:00 2.0
2025-01-01 03:00:00 2.0
2025-01-01 04:00:00 1.0
Name: price_rank, dtype: float64
You can then use df.merge
to add the result to dummy_df
. So, for your class
that could be:
class RollingRank:
def __init__(self, rank_col: str, group_by_col: str, window_length: int,
time_col: str):
self.rank_col = rank_col
self.time_col = time_col
self.group_by_col = group_by_col
self.window_length = window_length
def fit_transform(self, df):
df = df.sort_values(by=[self.group_by_col, self.time_col])
df.set_index(self.time_col, inplace=True)
gr_by = df.groupby(self.group_by_col)
ts_iter = iter(df.index.tolist())
dfg = (gr_by.rolling(window=f'{self.window_length}h',
center=True,
closed='both')['price']
.apply(lambda x: x.rank(method='min')[next(ts_iter)])
.rename(f'{self.rank_col}_rank')
)
df = df.merge(dfg, on=['zone', 'time'], how='left')
return df
df_rank = (RollingRank(rank_col="price",
group_by_col="zone",
window_length=3,
time_col="time")
.fit_transform(dummy_df)
)
本文标签: pythonPandas rolling rank issueStack Overflow
版权声明:本文标题:python - Pandas rolling rank issue - Stack Overflow 内容由网友自发贡献,该文观点仅代表作者本人, 转载请联系作者并注明出处:http://www.betaflare.com/web/1741912479a2404525.html, 本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌抄袭侵权/违法违规的内容,一经查实,本站将立刻删除。
发表评论