chr*_*ris 5 python numpy dataframe pandas
我试图将两个pandas数据帧相互增加.具体来说,我想将每列与其他df的每一列相乘.
数据帧是单热编码的,因此它们看起来像这样:
col_1, col_2, col_3, ...
0 1 0
1 0 0
0 0 1
...
Run Code Online (Sandbox Code Playgroud)
我可以使用for循环迭代每个列,但在python中计算成本很高,我希望有一种更简单的方法.
其中一个数据帧有500列,另一个有100列.
这是迄今为止我能写的最快的版本:
interact_pd = pd.DataFrame(index=df_1.index)
df1_columns = [column for column in df_1]
for column in df_2:
col_pd = df_1[df1_columns].multiply(df_2[column], axis="index")
interact_pd = interact_pd.join(col_pd, lsuffix='_' + column)
Run Code Online (Sandbox Code Playgroud)
我遍历df_2中的每一列并将该列的所有df_1相乘,然后将结果追加到interact_pd.我宁愿不使用for循环来做,因为这在计算上非常昂贵.有更快的方法吗?
编辑:例子
df_1:
1col_1, 1col_2, 1col_3
0 1 0
1 0 0
0 0 1
Run Code Online (Sandbox Code Playgroud)
df_2:
2col_1, 2col_2
0 1
1 0
0 0
Run Code Online (Sandbox Code Playgroud)
interact_pd:
1col_1_2col_1, 1col_2_2col_1,1col_3_2col_1, 1col_1_2col_2, 1col_2_2col_2,1col_3_2col_2
0 0 0 0 1 0
1 0 0 0 0 0
0 0 0 0 0 0
Run Code Online (Sandbox Code Playgroud)
# use numpy to get a pair of indices that map out every
# combination of columns from df_1 and columns of df_2
pidx = np.indices((df_1.shape[1], df_2.shape[1])).reshape(2, -1)
# use pandas MultiIndex to create a nice MultiIndex for
# the final output
lcol = pd.MultiIndex.from_product([df_1.columns, df_2.columns],
names=[df_1.columns.name, df_2.columns.name])
# df_1.values[:, pidx[0]] slices df_1 values for every combination
# like wise with df_2.values[:, pidx[1]]
# finally, I marry up the product of arrays with the MultiIndex
pd.DataFrame(df_1.values[:, pidx[0]] * df_2.values[:, pidx[1]],
columns=lcol)
Run Code Online (Sandbox Code Playgroud)
码
from string import ascii_letters
df_1 = pd.DataFrame(np.random.randint(0, 2, (1000, 26)), columns=list(ascii_letters[:26]))
df_2 = pd.DataFrame(np.random.randint(0, 2, (1000, 52)), columns=list(ascii_letters))
def pir1(df_1, df_2):
pidx = np.indices((df_1.shape[1], df_2.shape[1])).reshape(2, -1)
lcol = pd.MultiIndex.from_product([df_1.columns, df_2.columns],
names=[df_1.columns.name, df_2.columns.name])
return pd.DataFrame(df_1.values[:, pidx[0]] * df_2.values[:, pidx[1]],
columns=lcol)
def Test2(DA,DB):
MA = DA.as_matrix()
MB = DB.as_matrix()
MM = np.zeros((len(MA),len(MA[0])*len(MB[0])))
Col = []
for i in range(len(MB[0])):
for j in range(len(MA[0])):
MM[:,i*len(MA[0])+j] = MA[:,j]*MB[:,i]
Col.append('1col_'+str(i+1)+'_2col_'+str(j+1))
return pd.DataFrame(MM,dtype=int,columns=Col)
Run Code Online (Sandbox Code Playgroud)
结果
您可以沿着index轴的第一df列乘以第二列的每一列df,这是大数据集的最快方法(见下文):
df = pd.concat([df_1.mul(col[1], axis="index") for col in df_2.iteritems()], axis=1)
# Change the name of the columns
df.columns = ["_".join([i, j]) for j in df_2.columns for i in df_1.columns]
df
1col_1_2col_1 1col_2_2col_1 1col_3_2col_1 1col_1_2col_2 \
0 0 0 0 0
1 1 0 0 0
2 0 0 0 0
1col_2_2col_2 1col_3_2col_2
0 1 0
1 0 0
2 0 0
Run Code Online (Sandbox Code Playgroud)
def Test2(DA,DB):
MA = DA.as_matrix()
MB = DB.as_matrix()
MM = np.zeros((len(MA),len(MA[0])*len(MB[0])))
Col = []
for i in range(len(MB[0])):
for j in range(len(MA[0])):
MM[:,i*len(MA[0])+j] = MA[:,j]*MB[:,i]
Col.append('1col_'+str(i+1)+'_2col_'+str(j+1))
return pd.DataFrame(MM,dtype=int,columns=Col)
def Test3(df_1, df_2):
df = pd.concat([df_1.mul(i[1], axis="index") for i in df_2.iteritems()], axis=1)
df.columns = ["_".join([i,j]) for j in df_2.columns for i in df_1.columns]
return df
def Test4(df_1,df_2):
pidx = np.indices((df_1.shape[1], df_2.shape[1])).reshape(2, -1)
lcol = pd.MultiIndex.from_product([df_1.columns, df_2.columns],
names=[df_1.columns.name, df_2.columns.name])
return pd.DataFrame(df_1.values[:, pidx[0]] * df_2.values[:, pidx[1]],
columns=lcol)
def jeanrjc_imp(df_1, df_2):
df = pd.concat([df_1.mul(??i[1], axis="index") for i in df_2.iteritems()], axis=1, keys=df_2.columns)
return df
Run Code Online (Sandbox Code Playgroud)
对不起,丑陋的代码,最后的情节很重要:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
df_1 = pd.DataFrame(np.random.randint(0, 2, (1000, 600)))
df_2 = pd.DataFrame(np.random.randint(0, 2, (1000, 600)))
df_1.columns = ["1col_"+str(i) for i in range(len(df_1.columns))]
df_2.columns = ["2col_"+str(i) for i in range(len(df_2.columns))]
resa = {}
resb = {}
resc = {}
for f, r in zip([Test2, Test3, Test4, jeanrjc_imp], ["T2", "T3", "T4", "T3bis"]):
resa[r] = []
resb[r] = []
resc[r] = []
for i in [5, 10, 30, 50, 150, 200]:
a = %timeit -o f(df_1.iloc[:,:i], df_2.iloc[:, :10])
b = %timeit -o f(df_1.iloc[:,:i], df_2.iloc[:, :50])
c = %timeit -o f(df_1.iloc[:,:i], df_2.iloc[:, :200])
resa[r].append(a.best)
resb[r].append(b.best)
resc[r].append(c.best)
X = [5, 10, 30, 50, 150, 200]
fig, ax = plt.subplots(1, 3, figsize=[16,5])
for j, (a, r) in enumerate(zip(ax, [resa, resb, resc])):
for i in r:
a.plot(X, r[i], label=i)
a.set_xlabel("df_1 columns #")
a.set_title("df_2 columns # = {}".format(["10", "50", "200"][j]))
ax[0].set_ylabel("time(s)")
plt.legend(loc=0)
plt.tight_layout()
Run Code Online (Sandbox Code Playgroud)
随着T3b <=> jeanrjc_imp.这比Test3快一点.
根据您的数据集大小,在Test4和Test3(b)之间选择正确的函数.鉴于OP的数据集,Test3或jeanrjc_imp应该是最快的,也是最短的写!
HTH
| 归档时间: |
|
| 查看次数: |
1088 次 |
| 最近记录: |