Question

How to vectorize groupby combination lists of two columns in Pandas Dataframe

I've a dataframe and need to group by two columns from all possible combinations of dataframe columns ['A','B','C','D','E','F','G']

import pandas as pd

d = {'A': [0,1,1,0,0,1,0,0],
     'B': [1,1,0,0,0,1,0,1],
     'C': [0,0,1,1,0,0,1,0],
     'D': [1,1,1,0,0,1,0,0],
     'E': [0,0,0,1,0,0,0,1],
     'F': [0,0,1,1,0,0,1,0],
     'G': [1,0,1,1,0,0,0,0],
     'Feature_1':[1,0,1,1,0,1,1,0],
     'Feature_2':[0,1,1,0,1,0,0,1]}

df = pd.DataFrame(d)
print(df)

   A  B  C  D  E  F  G  Feature_1  Feature_2
0  0  1  0  1  0  0  1          1          0
1  1  1  0  1  0  0  0          0          1
2  1  0  1  1  0  1  1          1          1
3  0  0  1  0  1  1  1          1          0
4  0  0  0  0  0  0  0          0          1
5  1  1  0  1  0  0  0          1          0
6  0  0  1  0  0  1  0          1          0
7  0  1  0  0  1  0  0          0          1

Need to do groupby all possible combinations of columns of length 2

is there a way to vectorize this process which takes long time if dataframe columns are relatively large :

The output expected as:

Col_1   Col_2   Col_1_value Col_2_value Feature_1_Sum   Feature_2_Sum
0   A       B       0           0           2               1
1   A       B       0           1           1               1
2   A       B       1           0           1               1
3   A       B       1           1           1               1
0   A       C       0           0           1               2
... ...     ...     ...         ...         ...             ...
3   E       G       1           1           1               0
0   F       G       0           0           1               3
1   F       G       0           1           1               0
2   F       G       1           0           1               0
3   F       G       1           1           2               1
77 rows × 6 columns

What I've done through looping :

    from itertools import combinations

    cols_list = ['A','B','C','D','E','F','G']

    i=0

    for comb in combinations(cols_list, 2):
        i=i+1

        comb_list = [comb[0], comb[1]]

        try:
            del df_gp
        except Exception as e:
            pass
    
        try:
            del df_comb
         except Exception as e:
            pass

        df_gp = (df.groupby(by=comb_list).agg(Feature_1_Sum=('Feature_1','sum'), Feature_2_Sum=  ('Feature_2','sum')).reset_index())
    
        df_gp['Col_1'] = comb[0]
        df_gp['Col_2'] = comb[1]
    
        df_gp['Col_1_value'] = df_gp[comb[0]]
        df_gp['Col_2_value'] = df_gp[comb[1]]
    
        df_comb = pd.DataFrame()

        cols = ['Col_1','Col_2','Col_1_value','Col_2_value','Feature_1_Sum','Feature_2_Sum']
        df_comb = df_gp[cols]
    
        if i==1:
            df_agg = df_comb
        else:
            df_agg = pd.concat([df_agg, df_comb])
               
    df_agg

4 76 4

1 Jan 1970

import pandas as pd import numpy as np from itertools import chain, combinations cols_list = ['A', 'B', 'C', 'D', 'E', 'F', 'G'] features = ['Feature_1','Feature_2'] agg_features = [f"{col}_sum" for col in features] combo = enumerate(cols_list) combo = zip(*combo) combo = zip(*combo) combo = combinations(combo, 2) combo = enumerate(combo) combo = (((*a,num),(*b,num)) for num, (a,b) in combo) combo = [*chain.from_iterable(combo)] combo = pd.MultiIndex.from_tuples(combo) # build two dataframes, one with cols_list, the other containing just the features # we blow the columns horizontally, dff = df.loc[:, 'A':'G'] columns = pd.MultiIndex.from_arrays([range(dff.columns.size), dff.columns]) dff.columns = columns dff=dff.reindex(combo.droplevel(-1),axis=1) feat1 = np.tile(['Feature_1','Feature_2'],len(combo)//2) feat2 = np.repeat(range(len(combo)//2),2) feat3 = [feat1,feat2] feat3 = pd.MultiIndex.from_arrays(feat3) features=(df .loc[:, features] .reindex(feat3.droplevel(1),axis=1) ) # reshape, using numpy and recombine into one dataframe # you can combine the numpy arrays # and create a DataFrame from the concatenated array f1=dff.to_numpy().reshape((-1,2),order='C') f1=pd.DataFrame(f1,columns=['col_value1','col2_value']) f2=dff.columns.get_level_values(1).to_numpy() f2=np.reshape(f2, (-1,2),order='C') f2=np.tile(f2, (len(df),1)) f2=pd.DataFrame(f2,columns=['col1','col2']) f3=features.to_numpy().reshape((-1,2),order='C') f3=pd.DataFrame(f3,columns=['Feature_1_sum','Feature_2_sum']) out=pd.concat([f2,f1,f3],axis=1) # groupby and compute result group_columns = ['col1','col2','col_value1','col2_value'] outcome = (out .groupby(group_columns,sort=False,as_index=False) .sum() # you can ignore this .sort_values(group_columns,ignore_index=True) ) print(outcome) col1 col2 col_value1 col2_value Feature_1_sum Feature_2_sum 0 A B 0 0 2 1 1 A B 0 1 1 1 2 A B 1 0 1 1 3 A B 1 1 1 1 4 A C 0 0 1 2 .. ... ... ... ... ... ... 72 E G 1 1 1 0 73 F G 0 0 1 3 74 F G 0 1 1 0 75 F G 1 0 1 0 76 F G 1 1 2 1 [77 rows x 6 columns]