Question
Parsing formulas efficiently using regex and Polars
I am trying to parse a series of mathematical formulas and need to extract variable names efficiently using Polars in Python. Regex support in Polars seems to be limited, particularly with look-around assertions. Is there a simple, efficient way to parse symbols from formulas?
Here's the snippet of my code:
import re
import polars as pl
# Define the regex pattern
FORMULA_DECODER = r"\b[A-Za-z][A-Za-z_0-9_]*\b(?!\()"
# \b # Assert a word boundary to ensure matching at the beginning of a word
# [A-Za-z] # Match an uppercase or lowercase letter at the start
# [A-Za-z0-9_]* # Match following zero or more occurrences of valid characters (letters, digits, or underscores)
# \b # Assert a word boundary to ensure matching at the end of a word
# (?!\() # Negative lookahead to ensure the match is not followed by an open parenthesis (indicating a function)
# Sample formulas
formulas = ["3*sin(x1+x2)+A_0",
"ab*exp(2*x)"]
# expected result
pl.Series(formulas).map_elements(lambda formula: re.findall(FORMULA_DECODER, formula), return_dtype=pl.List(pl.String))
# Series: '' [list[str]]
# [
# ["x1", "x2", "A_0"]
# ["ab", "x"]
# ]
# Polars does not support this regex pattern
pl.Series(formulas).str.extract_all(FORMULA_DECODER)
# ComputeError: regex error: regex parse error:
# \b[A-Za-z][A-Za-z_0-9_]*\b(?!\()
# ^^^
# error: look-around, including look-ahead and look-behind, is not supported
Edit Here is a small benchmark:
import random
import string
import re
import polars as pl
def generate_symbol():
"""Generate random symbol of length 1-3."""
characters = string.ascii_lowercase + string.ascii_uppercase
return ''.join(random.sample(characters, random.randint(1, 3)))
def generate_formula():
"""Generate random formula with 2-5 unique symbols."""
op = ['+', '-', '*', '/']
return ''.join([generate_symbol()+random.choice(op) for _ in range(random.randint(2, 6))])[:-1]
def generate_formulas(num_formulas):
"""Generate random formulas."""
return [generate_formula() for _ in range(num_formulas)]
# Sample formulas
# formulas = ["3*sin(x1+x2)+(A_0+B)",
# "ab*exp(2*x)"]
def parse_baseline(formulas):
"""Baseline serves as performance reference. It will not detect function names."""
FORMULA_DECODER_NO_LOOKAHEAD = r"\b[A-Za-z][A-Za-z_0-9_]*\b\(?"
return pl.Series(formulas).str.extract_all(FORMULA_DECODER_NO_LOOKAHEAD)
def parse_lookahead(formulas):
FORMULA_DECODER = r"\b[A-Za-z][A-Za-z_0-9_]*\b(?!\()"
return pl.Series(formulas).map_elements(lambda formula: re.findall(FORMULA_DECODER, formula), return_dtype=pl.List(pl.String))
def parse_no_lookahead_and_filter(formulas):
FORMULA_DECODER_NO_LOOKAHEAD = r"\b[A-Za-z][A-Za-z_0-9_]*\b\(?"
return (
pl.Series(formulas)
.str.extract_all(FORMULA_DECODER_NO_LOOKAHEAD)
# filter for matches not containing an open parenthesis
.list.eval(pl.element().filter(~pl.element().str.contains("(", literal=True)))
)
formulas = generate_formulas(1000)
%timeit parse_lookahead(formulas)
%timeit parse_no_lookahead_and_filter(formulas)
%timeit parse_baseline(formulas)
# 10.7 ms ± 387 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
# 1.31 ms ± 76.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
# 708 μs ± 6.43 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
3 58
3