mik*_*123 6 python lambda pandas
这个问题与我之前发布的问题非常相似,只有一处更改。我不仅想计算所有列的绝对差异,还想找到“Z”列的幅度差异,因此如果当前 Z 比前一个大 1.1 倍,则保留它。
(问题的更多背景)
# Example input: three ranks, each contributing two (x, y, z) rows.
df = pd.DataFrame(
    dict(
        rank=[1, 1, 2, 2, 3, 3],
        x=[0, 3, 0, 3, 4, 2],
        y=[0, 4, 0, 4, 5, 5],
        z=[1, 3, 1.2, 3.25, 3, 6],
    )
)
print(df)
# Printed frame:
#    rank  x  y     z
# 0     1  0  0  1.00
# 1     1  3  4  3.00
# 2     2  0  0  1.20
# 3     2  3  4  3.25
# 4     3  4  5  3.00
# 5     3  2  5  6.00
这就是我想要的输出
# Desired result: the rows of the input that should survive the filtering.
output = pd.DataFrame(
    dict(
        rank=[1, 1, 2, 3],
        x=[0, 3, 0, 2],
        y=[0, 4, 0, 5],
        z=[1, 3, 1.2, 6],
    )
)
print(output)
# The row labels below are those of the surviving rows in the original `df`
# (0, 1, 2 and 5); printing `output` itself shows a fresh 0..3 index.
#    rank  x  y    z
# 0     1  0  0  1.0
# 1     1  3  4  3.0
# 2     2  0  0  1.2
# 5     3  2  5  6.0
基本上,我希望实现的是:如果当前行的 x、y 落在前一个排名中某一行的 x、y 各 ±1 的范围内,并且当前行的 z 小于该行 z 的 1.1 倍(z < 1.1z),就删除当前行。
因此,以排名 1 的两行为基准,排名 2 中的任意一行只要满足 x = (-1-1)、y = (-1-1)、z = (<1.1),或者满足 x = (2-5)、y = (3-5)、z = (<3.3) 的组合,我就希望将其删除。
这是使用numpy 广播的解决方案:
# Initially, no row is dropped
df['drop'] = False

for r in range(df['rank'].min(), df['rank'].max()):
    # Find the x_min, x_max, y_min, y_max, z_max of the current rank
    cond = df['rank'] == r
    x, y, z = df.loc[cond, ['x','y','z']].to_numpy().T
    x_min, x_max = x + [[-1], [1]] # use numpy broadcasting to ±1 in one command
    y_min, y_max = y + [[-1], [1]]
    z_max        = z * 1.1

    # Find the x, y, z of the next rank. Raise them one dimension
    # so that we can make a comparison matrix against x_min, x_max, ...
    cond = df['rank'] == r + 1
    if not cond.any():
        continue
    x, y, z = df.loc[cond, ['x','y','z']].to_numpy().T[:, :, None]

    # Condition to drop a row
    drop = (
        (x_min <= x) & (x <= x_max) &
        (y_min <= y) & (y <= y_max) &
        (z <= z_max)
    ).any(axis=1)
    df.loc[cond, 'drop'] = drop

# Result
df[~df['drop']]

下面是一个更精简的版本(并且可能更快)。这也是让你未来的队友在阅读代码时感到困惑的好方法:
r, x, y, z = df[['rank', 'x', 'y', 'z']].T.to_numpy()
rr, xx, yy, zz = [col[:,None] for col in [r, x, y, z]]

drop = (
    (rr == r + 1) &
    (x-1 <= xx) & (xx <= x+1) &
    (y-1 <= yy) & (yy <= y+1) &
    (zz <= z*1.1)
).any(axis=1)

# Result
df[~drop]

其作用是将 df 的每一行与所有行(包括其自身)逐一比较,并在同时满足以下条件时返回 True(即丢弃该行):
- 该行的 rank == 另一行的 rank + 1;并且
- 该行的 x, y, z 落在另一行的 x, y, z 所确定的范围内。

我修改了 mozway 的函数,使其能够按照您的要求工作。
# Comparing 'equal' float values may go wrong (e.g. 1.1 * 3 is slightly above
# 3.3), so the z threshold is lowered by this small tolerance.
DELTA = 0.1**12


def check_previous_group(rank, d, groups):
    """Flag the rows of one rank group that should be dropped.

    A row is flagged when at least one row of the previous group
    (``rank - 1``) has ``x`` and ``y`` both within 1 of the row's own
    ``x`` and ``y`` and the row's ``z`` is below 1.1 times that previous
    row's ``z`` (minus ``DELTA``).

    Parameters
    ----------
    rank
        Key of the group being checked.
    d
        DataFrame holding the rows of that group; needs columns
        ``x``, ``y`` and ``z``.
    groups
        The groupby object the group came from; ``groups.groups`` and
        ``groups.get_group`` are used to look up the previous group.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like ``d``; True marks a row to drop.
    """
    if rank - 1 not in groups.groups:
        # No previous group exists: nothing in this group can be dropped.
        return pd.Series(False, index=d.index)

    d_prev = groups.get_group(rank - 1)

    def _matches_previous(s):
        # x and y must both be within 1 of the same previous-group row ...
        close_xy = (d_prev[['x', 'y']] - s[['x', 'y']]).abs().le(1).all(axis=1)
        # ... and z must stay below 1.1 times that previous row's z.
        # (The threshold is based on the previous z, not the previous x.)
        small_z = s['z'] < 1.1 * d_prev['z'] - DELTA
        return (close_xy & small_z).any()

    return d.apply(_matches_previous, axis=1)
测试,
>>> df = pd.DataFrame({
    'rank': [1, 1, 2, 2, 3, 3],
    'x': [0, 3, 0, 3, 4, 2],
    'y': [0, 4, 0, 4, 5, 5],
    'z': [1, 3, 1.2, 3.25, 3, 6],
})
>>> df
   rank  x  y     z
0     1  0  0  1.00
1     1  3  4  3.00
2     2  0  0  1.20
3     2  3  4  3.25
4     3  4  5  3.00
5     3  2  5  6.00
>>> groups = df.groupby('rank')
>>> mask = pd.concat([check_previous_group(rank, d, groups) for rank,d in groups])
>>> df[~mask]
   rank  x  y    z
0     1  0  0  1.0
1     1  3  4  3.0
2     2  0  0  1.2
5     3  2  5  6.0
>>> df = pd.DataFrame({
    'rank': [1, 1, 2, 2, 3, 3],
    'x': [0, 3, 0, 3, 4, 2],
    'y': [0, 4, 0, 4, 5, 5],
    'z': [1, 3, 1.2, 3.3, 3, 6],
})
>>> df
   rank  x  y    z
0     1  0  0  1.0
1     1  3  4  3.0
2     2  0  0  1.2
3     2  3  4  3.3
4     3  4  5  3.0
5     3  2  5  6.0
>>> groups = df.groupby('rank')
>>> mask = pd.concat([check_previous_group(rank, d, groups) for rank,d in groups])
>>> df[~mask]
   rank  x  y    z
0     1  0  0  1.0
1     1  3  4  3.0
2     2  0  0  1.2
3     2  3  4  3.3
5     3  2  5  6.0