Question

How to vectorize groupby combination lists of two columns in Pandas Dataframe

I've a dataframe and need to group by two columns from all possible combinations of dataframe columns ['A','B','C','D','E','F','G']

import pandas as pd

d = {'A': [0,1,1,0,0,1,0,0],
     'B': [1,1,0,0,0,1,0,1],
     'C': [0,0,1,1,0,0,1,0],
     'D': [1,1,1,0,0,1,0,0],
     'E': [0,0,0,1,0,0,0,1],
     'F': [0,0,1,1,0,0,1,0],
     'G': [1,0,1,1,0,0,0,0],
     'Feature_1':[1,0,1,1,0,1,1,0],
     'Feature_2':[0,1,1,0,1,0,0,1]}

df = pd.DataFrame(d)
print(df)

   A  B  C  D  E  F  G  Feature_1  Feature_2
0  0  1  0  1  0  0  1          1          0
1  1  1  0  1  0  0  0          0          1
2  1  0  1  1  0  1  1          1          1
3  0  0  1  0  1  1  1          1          0
4  0  0  0  0  0  0  0          0          1
5  1  1  0  1  0  0  0          1          0
6  0  0  1  0  0  1  0          1          0
7  0  1  0  0  1  0  0          0          1

Need to do groupby all possible combinations of columns of length 2

is there a way to vectorize this process which takes long time if dataframe columns are relatively large :

The output expected as:

Col_1   Col_2   Col_1_value Col_2_value Feature_1_Sum   Feature_2_Sum
0   A       B       0           0           2               1
1   A       B       0           1           1               1
2   A       B       1           0           1               1
3   A       B       1           1           1               1
0   A       C       0           0           1               2
... ...     ...     ...         ...         ...             ...
3   E       G       1           1           1               0
0   F       G       0           0           1               3
1   F       G       0           1           1               0
2   F       G       1           0           1               0
3   F       G       1           1           2               1
77 rows × 6 columns

What I've done through looping :

    from itertools import combinations

    cols_list = ['A','B','C','D','E','F','G']

    i=0

    for comb in combinations(cols_list, 2):
        i=i+1

        comb_list = [comb[0], comb[1]]

        try:
            del df_gp
        except Exception as e:
            pass
    
        try:
            del df_comb
         except Exception as e:
            pass

        df_gp = (df.groupby(by=comb_list).agg(Feature_1_Sum=('Feature_1','sum'), Feature_2_Sum=  ('Feature_2','sum')).reset_index())
    
        df_gp['Col_1'] = comb[0]
        df_gp['Col_2'] = comb[1]
    
        df_gp['Col_1_value'] = df_gp[comb[0]]
        df_gp['Col_2_value'] = df_gp[comb[1]]
    
        df_comb = pd.DataFrame()

        cols = ['Col_1','Col_2','Col_1_value','Col_2_value','Feature_1_Sum','Feature_2_Sum']
        df_comb = df_gp[cols]
    
        if i==1:
            df_agg = df_comb
        else:
            df_agg = pd.concat([df_agg, df_comb])
               
    df_agg                        
 4  76  4
1 Jan 1970

Solution

 0

This relies on numpy and pandas - all the columns are built, and then groupby is called once. itertools.combinations does generate a lot of columns though, so depending on the size of your data/computer memory, you may get OOM error.

import pandas as pd
import numpy as np
from itertools import chain, combinations

cols_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
features = ['Feature_1','Feature_2']
agg_features = [f"{col}_sum" for col in features]
combo = enumerate(cols_list)
combo = zip(*combo)
combo = zip(*combo)
combo = combinations(combo, 2)
combo = enumerate(combo)
combo = (((*a,num),(*b,num)) for num, (a,b) in combo)
combo = [*chain.from_iterable(combo)]
combo = pd.MultiIndex.from_tuples(combo)

# build two dataframes, one with cols_list, the other containing just the features
# we blow the columns horizontally, 
dff = df.loc[:, 'A':'G']
columns = pd.MultiIndex.from_arrays([range(dff.columns.size), dff.columns])
dff.columns = columns
dff=dff.reindex(combo.droplevel(-1),axis=1)

feat1 = np.tile(['Feature_1','Feature_2'],len(combo)//2)
feat2 = np.repeat(range(len(combo)//2),2)
feat3 = [feat1,feat2]
feat3 = pd.MultiIndex.from_arrays(feat3)
features=(df
         .loc[:, features]
         .reindex(feat3.droplevel(1),axis=1)
         )

# reshape, using numpy and recombine into one dataframe
# you can combine the numpy arrays
# and create a DataFrame from the concatenated array
f1=dff.to_numpy().reshape((-1,2),order='C')
f1=pd.DataFrame(f1,columns=['col_value1','col2_value'])
f2=dff.columns.get_level_values(1).to_numpy()
f2=np.reshape(f2, (-1,2),order='C')
f2=np.tile(f2, (len(df),1))
f2=pd.DataFrame(f2,columns=['col1','col2'])
f3=features.to_numpy().reshape((-1,2),order='C')
f3=pd.DataFrame(f3,columns=['Feature_1_sum','Feature_2_sum'])
out=pd.concat([f2,f1,f3],axis=1)

# groupby and compute result
group_columns = ['col1','col2','col_value1','col2_value']
outcome = (out
          .groupby(group_columns,sort=False,as_index=False)
          .sum()
          # you can ignore this
          .sort_values(group_columns,ignore_index=True)
          )

print(outcome)
   col1 col2  col_value1  col2_value  Feature_1_sum  Feature_2_sum
0     A    B           0           0              2              1
1     A    B           0           1              1              1
2     A    B           1           0              1              1
3     A    B           1           1              1              1
4     A    C           0           0              1              2
..  ...  ...         ...         ...            ...            ...
72    E    G           1           1              1              0
73    F    G           0           0              1              3
74    F    G           0           1              1              0
75    F    G           1           0              1              0
76    F    G           1           1              2              1

[77 rows x 6 columns]
2024-07-10
sammywemmy