Question

Removing one field from a struct in polars

I want to remove one field from a struct column. Currently I do it like this, but is there a simpler way to achieve it?

import polars as pl
import polars.selectors as cs

def remove_one_field(df: pl.DataFrame) -> pl.DataFrame:
    """Rebuild the ``meta_data`` struct column without its ``system_data`` field.

    The struct is unnested into top-level columns, ``system_data`` is
    deselected, and the remaining fields are packed back into ``meta_data``.
    The names of the retained fields are printed along the way.
    """
    # Selector matching every column except system_data.
    everything_but_system = cs.all() - cs.by_name('system_data')

    # Names of the struct fields that survive, found by unnesting the struct
    # column on its own.
    meta_data_columns = (
        df.select('meta_data')
        .unnest('meta_data')
        .select(everything_but_system)
        .columns
    )
    print(meta_data_columns)

    flattened = df.unnest('meta_data').select(everything_but_system)
    repacked = flattened.with_columns(meta_data=pl.struct(meta_data_columns))
    # The unnested field columns are now duplicated inside the struct, so the
    # loose top-level copies are removed.
    return repacked.drop(meta_data_columns)

# Example usage: a two-row frame whose meta_data struct column is built from
# dicts; only the first dict has a system_data key.
input_df = pl.DataFrame({
    "id": [1, 2],
    "meta_data": [{"system_data": "to_remove", "user_data": "keep"}, {"user_data": "keep_"}]
})
# Drop system_data from the struct and show the resulting frame.
output_df = remove_one_field(input_df)
print(output_df)
['user_data']
shape: (2, 2)
┌─────┬───────────┐
│ id  ┆ meta_data │
│ --- ┆ ---       │
│ i64 ┆ struct[1] │
╞═════╪═══════════╡
│ 1   ┆ {"keep"}  │
│ 2   ┆ {"keep_"} │
└─────┴───────────┘

Is there something like `select` that works on the fields within a struct?

 5  110  5
1 Jan 1970

Solution

 4

You can use struct.field(), which accepts either a list of strings or multiple string arguments. Since you know your DataFrame's schema, you can easily build the list of fields you want to keep:

# Names of every field in the meta_data struct except the one being removed.
fields = [
    field.name
    for field in input_df.schema["meta_data"].fields
    if field.name != "system_data"
]

# Rebuild meta_data from only the retained fields.
input_df.with_columns(
    meta_data=pl.struct(pl.col("meta_data").struct.field(fields))
)

┌─────┬───────────┐
│ id  ┆ meta_data │
│ --- ┆ ---       │
│ i64 ┆ struct[1] │
╞═════╪═══════════╡
│ 1   ┆ {"keep"}  │
│ 2   ┆ {"keep_"} │
└─────┴───────────┘
2024-07-07
Roman Pekar

Solution

 0

Depending on how dynamically you need to do it, it could range from

# Rebuild meta_data as a struct that holds only the user_data field.
input_df.with_columns(
    meta_data=pl.struct(pl.col("meta_data").struct.field("user_data"))
)

which satisfies your example, to something like

def drop_struct_fields(
    df: pl.DataFrame,
    struct_column_name: str,
    struct_fields_to_drop: str | set,
) -> pl.DataFrame:
    """Return ``df`` with the given fields removed from one struct column.

    Args:
        df: Frame containing the struct column.
        struct_column_name: Name of the struct column to rebuild.
        struct_fields_to_drop: A single field name, or a set of field names,
            to leave out. Names that are not fields of the struct are ignored.

    Returns:
        A frame in which ``struct_column_name`` has been rebuilt from only
        the fields that were not dropped, in schema order.

    Raises:
        ValueError: If every field of the struct would be dropped.
    """
    # A bare string is one field name; wrap it so the membership test below
    # compares whole names instead of doing a substring match.
    if isinstance(struct_fields_to_drop, str):
        struct_fields_to_drop = {struct_fields_to_drop}

    # Get the struct fields from the column except those we want to drop.
    # Each item yielded by the struct dtype is indexed with [0] for its name.
    output_struct_cols = [
        c[0] for c in df.schema[struct_column_name] if c[0] not in struct_fields_to_drop
    ]

    # struct.field() needs at least one name; calling it with none fails with
    # an unhelpful "missing positional argument" error, so report it clearly.
    if not output_struct_cols:
        raise ValueError(
            f"Cannot drop every field of struct column {struct_column_name!r}; "
            "drop the column itself instead."
        )

    # Select only the struct fields we want to include in the output and
    # write them back under the original column name.
    return df.with_columns(
        pl.struct(pl.col(struct_column_name).struct.field(*output_struct_cols)).alias(
            struct_column_name
        )
    )

# Remove system_data from the meta_data struct column.
drop_struct_fields(
    df=input_df,
    struct_column_name="meta_data",
    struct_fields_to_drop="system_data",
)
# or drop_struct_fields(input_df, "meta_data", {"system_data", "other_struct_field"})

which is much more dynamic and generally reusable.

Disclaimer: the output_struct_cols expression was inspired by (stolen from) Roman Pekar's answer.

To be honest, I don't think pl.Expr.struct.drop would go astray as a feature request either. There is already struct.with_fields, which is like with_columns for structs. There is a request for it here on GitHub if you want to support it.

2024-07-07
Henry Harbeck