I have a symbolic regression project (here on GitHub). I first use scipy's curve_fit to find parameters of linear expressions (see the fit function at line 85 of sr.py), but for nonlinear expressions (the commented-out tests in tests/test_sr.py) it is not efficient. To simplify the problem, I want to add constraints on the parameters, such as requiring them to be integers. I tried DEAP without success (maybe I used it incorrectly). Is there any way to achieve this?
For example, the following code fails (the parameter values should be found in one iteration, but those found are not correct; i.e. sin and exp are applied first, then the mul operator is applied between the available combinations). In this case, the symbolic expression whose parameters must be optimized is i*(a*sin(b*x+c)+d)*(e*exp(f*x+g)+h)+j (up to 10 parameters to find).
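For reference, one valid integer assignment in (-5, 5) reduces this template to the target, namely a = b = e = f = i = 1 and c = d = g = h = j = 0, which can be checked with sympy:

import sympy

x = sympy.Symbol("x")
a = b = e = f = i = 1
c = d = g = h = j = 0
# Instantiate the template i*(a*sin(b*x+c)+d)*(e*exp(f*x+g)+h)+j.
expr = i * (a * sympy.sin(b * x + c) + d) * (e * sympy.exp(f * x + g) + h) + j
assert sympy.simplify(expr - sympy.sympify("sin(x)*exp(x)")) == 0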
import operator

import numpy as np
import sympy

import sr

def test_2():
    # sin(x)*exp(x)
    model = sr.SR(niterations = 1,
                  unary_operators = {"sin": (sympy.sin, np.sin),
                                     "exp": (sympy.exp, np.exp)},
                  binary_operators = {"*": (operator.mul, operator.mul)},
                  discrete_param_values = ["(-5, 5)"],
                  foundBreak = True,
                  maxfev = 200)
    n = 100
    xmin = -5
    xmax = 10
    x = (xmax - xmin) * np.random.rand(n) + xmin
    y = np.sin(x) * np.exp(x)
    model.predict([x], y, ["x"])
    assert(len(model.bestExpressions) == 1)
    assert(model.bestExpressions[0][0] == sympy.sympify("sin(x)*exp(x)"))
My implementation of the fit function tries several curve_fit runs from random starting points and rounds the results (for discrete values):
from scipy.optimize import curve_fit

def fit(func, value_vars, y, p0, loss_func, eps, maxfev, discrete_values = []):
    # Unconstrained case: a single curve_fit from the given starting point.
    if (len(discrete_values) == 0):
        try:
            value_params, _ = curve_fit(func, value_vars, y, p0 = p0, maxfev = maxfev)
        except RuntimeError as e:
            print(e)
            return p0
        return value_params
    # Sanity-check fit from p0: if even this fails, give up early.
    try:
        value_params, _ = curve_fit(func, value_vars, y, p0 = p0, maxfev = maxfev)
    except RuntimeError as e:
        print(e)
        return p0
    # Discrete case: multi-start curve_fit, rounding each continuous result
    # onto the allowed discrete values and keeping the best rounded candidate.
    best_x = random_discrete_values(len(p0), discrete_values)
    try:
        best_loss = loss_func(func(value_vars, *best_x), y)
        for i in range(0, maxfev):
            try:
                value_params, _ = curve_fit(func, value_vars, y,
                                            p0 = random_discrete_values(len(p0), discrete_values),
                                            maxfev = 10 * maxfev)
                x = round_discrete_values(value_params, discrete_values)
                loss = loss_func(func(value_vars, *x), y)
                if (loss < best_loss and any(x)):
                    best_loss = loss
                    best_x = x
                    if (best_loss < eps):
                        break
            except (RuntimeError, ValueError, OverflowError):
                pass
    except (ValueError, OverflowError):
        pass
    return best_x
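For a concrete idea of the intended call, here is a hypothetical usage on a two-parameter model (mse_loss, random_discrete_values and round_discrete_values are defined in the MCVE below):

import numpy as np

def two_param_model(x, a, b):
    return a * np.sin(b * x)

x = np.linspace(-5, 10, 100)
y = 3 * np.sin(2 * x)
# Both parameters constrained to integers in [-5, 5].
params = fit(two_param_model, x, y, [1, 1], mse_loss, 1e-6, 100, ["(-5, 5)"])
print(params)  # ideally [3, 2] (or the equivalent [-3, -2])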
Here is an MCVE with Optuna; it fails too:
import math
import random

import numpy as np
import optuna
import sympy
from scipy.optimize import curve_fit
def range_discrete_values(discrete_values):
    # Deduce the overall type (int or float) and the bounds of all allowed values.
    type_, min_, max_ = int, math.inf, -math.inf
    for value in discrete_values:
        if (type(value) in (int, float)):
            # Bare numeric value: a float makes the whole range continuous.
            if (type(value) is float):
                type_ = float
            min_ = min(min_, value)
            max_ = max(max_, value)
        elif (type(value) == str):
            s = value
            a, b = [float(x) for x in s[1:-1].split(",")]
            assert(a <= b)
            if (s[0] == "(" or s[0] == ")"):
                # "(a, b)" denotes an integer range.
                a, b = int(a), int(b)
            elif (s[0] == "[" or s[0] == "]"):
                # "[a, b]" denotes a real interval.
                type_ = float
            min_ = min(min_, a)
            max_ = max(max_, b)
    return type_, min_, max_
def random_discrete_values(n, discrete_values):
    # Draw n random values, each from a randomly chosen discrete specification.
    values = []
    for i in range(0, n):
        value = discrete_values[random.randint(0, len(discrete_values) - 1)]
        if (type(value) == str):
            s = value
            a, b = [float(x) for x in s[1:-1].split(",")]
            assert(a <= b)
            if (s[0] == "(" or s[0] == ")"):
                # Integer range; reversed brackets exclude the bound.
                a, b = int(a), int(b)
                if (s[0] == ")"):
                    a += 1
                if (s[-1] == "("):
                    b -= 1
                value = random.randint(a, b)
            elif (s[0] == "[" or s[0] == "]"):
                # Real interval: uniform sample.
                value = (b - a) * random.random() + a
        values.append(value)
    return values
def round_discrete_values(values, discrete_values):
    # Snap each value to the closest allowed discrete value.
    for i in range(0, len(values)):
        best_diff = math.inf
        best_value = None
        for v in discrete_values:
            if (type(v) == str):
                s = v
                a, b = [float(x) for x in s[1:-1].split(",")]
                assert(a <= b)
                if (s[0] == "(" or s[0] == ")"):
                    a, b = int(a), int(b)
                    if (s[0] == ")"):
                        a += 1
                    if (s[-1] == "("):
                        b -= 1
                if (a <= values[i] and values[i] <= b):
                    # Inside the range: round to the nearest integer for an
                    # integer range, keep the value as is for a real interval.
                    best_diff = 0
                    if (s[0] == "(" or s[0] == ")"):
                        best_value = int(round(values[i]))
                    else:
                        best_value = values[i]
                    break
                # Outside the range: the closest candidate is the nearest bound.
                if (abs(values[i] - a) < best_diff):
                    best_diff = abs(values[i] - a)
                    best_value = a
                if (abs(values[i] - b) < best_diff):
                    best_diff = abs(values[i] - b)
                    best_value = b
            else:
                if (abs(values[i] - v) < best_diff):
                    best_diff = abs(values[i] - v)
                    best_value = v
        values[i] = best_value
    return values
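To make the interval notation concrete, here is what these helpers return for the specification used below (the random draw is just one possible output):

print(range_discrete_values(["(-5, 5)"]))              # (<class 'int'>, -5, 5)
print(random_discrete_values(3, ["(-5, 5)"]))          # e.g. [2, -4, 0]
print(round_discrete_values([1.7, -6.2], ["(-5, 5)"])) # [2, -5]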
def fit(func, value_vars, y, p0, loss_func, eps, maxfev, discrete_values = []):
    # Unconstrained case: plain curve_fit.
    if (len(discrete_values) == 0):
        try:
            value_params, _ = curve_fit(func, value_vars, y, p0 = p0, maxfev = maxfev)
        except RuntimeError as e:
            print(e)
            return p0
        return value_params
    # Constrained case: let Optuna search the box deduced from discrete_values.
    type_, min_, max_ = range_discrete_values(discrete_values)
    def float_objective(trial):
        x = [trial.suggest_float('x' + str(i), min_, max_) for i in range(0, len(p0))]
        return loss_func(func(value_vars, *x), y)
    def int_objective(trial):
        x = [trial.suggest_int('x' + str(i), min_, max_) for i in range(0, len(p0))]
        return loss_func(func(value_vars, *x), y)
    study = optuna.create_study()
    if (type_ == int):
        study.optimize(int_objective, n_trials = maxfev)
    else:
        study.optimize(float_objective, n_trials = maxfev)
    value_params = list(study.best_params.values())
    value_params = round_discrete_values(value_params, discrete_values)
    return value_params
def mse_loss(x, y):
    return np.sum((x - y) ** 2)

def model(x, a, b, c, d, e, f, g, h, i, j):
    return i * (a * np.sin(b * x + c) + d) * (e * np.exp(f * x + g) + h) + j

def sym_model(x, a, b, c, d, e, f, g, h, i, j):
    return i * (a * sympy.sin(b * x + c) + d) * (e * sympy.exp(f * x + g) + h) + j
def test_2():
    # sin(x)*exp(x)
    n = 100
    xmin = -5
    xmax = 10
    x = (xmax - xmin) * np.random.rand(n) + xmin
    y = np.sin(x) * np.exp(x)
    p0 = random_discrete_values(10, ["(-5, 5)"])
    a, b, c, d, e, f, g, h, i, j = fit(model, x, y, p0, mse_loss, 1e-6, 100, ["(-5, 5)"])
    assert(sym_model(sympy.Symbol("x"), a, b, c, d, e, f, g, h, i, j) == sympy.sympify("exp(x)*sin(x)"))

if __name__ == '__main__':
    test_2()
I tried Nevergrad, but it fails too. I don't know whether existing symbolic regression tools handle this case, and if so, how they do it (with PySR I must set complexity_of_constants=100 to find the correct result).
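For completeness, this is roughly the shape of the integer-constrained Nevergrad search I mean (a minimal sketch, not the exact code I ran; it reuses model, mse_loss, x and y from the MCVE above and assumes nevergrad's Array bounds/integer-casting API):

import nevergrad as ng

# Ten integer parameters in [-5, 5].
param = ng.p.Array(shape = (10,)).set_bounds(-5, 5).set_integer_casting()
optimizer = ng.optimizers.NGOpt(parametrization = param, budget = 2000)

def objective(p):
    return float(mse_loss(model(x, *p), y))

recommendation = optimizer.minimize(objective)
print(recommendation.value)  # best integer parameter vector found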
I think the most difficult case is 1/sqrt(2*pi)*exp(-x**2/2): it has three real constants, 1/sqrt(2*pi), -1/2 and 0. Ideally the symbolic expression to explore is a*exp(b*x**2+c*x+d)+f, but finding the correct parameters from data is another matter.
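For this template the target assignment is a = 1/sqrt(2*pi), b = -1/2 and c = d = f = 0, which can be verified symbolically:

import sympy

x = sympy.Symbol("x")
a = 1 / sympy.sqrt(2 * sympy.pi)
b = sympy.Rational(-1, 2)
c = d = f = 0
expr = a * sympy.exp(b * x**2 + c * x + d) + f
assert sympy.simplify(expr - sympy.sympify("1/sqrt(2*pi)*exp(-x**2/2)")) == 0

Note that a is irrational, so an integer-only constraint cannot represent it; this case goes beyond the discrete setting above.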
I don't know whether computing differences numerically and doing a gradient descent could be an alternative to brute force or genetic algorithms. I saw pybnb, but I have not managed to set up a minimal example for my case; if it is the solution, I will take it.
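As a middle ground between brute force and genetic algorithms, a greedy integer local search could polish a rounded curve_fit result (a hypothetical sketch; model, mse_loss, x and y are as in the MCVE above):

import numpy as np

def integer_local_search(func, x, y, loss_func, p0, lo = -5, hi = 5, max_rounds = 100):
    # Greedy coordinate search over integers in [lo, hi]: repeatedly try
    # moving each coordinate by -1 or +1 and keep any improvement.
    best = [int(round(v)) for v in p0]
    best_loss = loss_func(func(x, *best), y)
    for _ in range(max_rounds):
        improved = False
        for i in range(len(best)):
            for step in (-1, 1):
                cand = list(best)
                cand[i] = min(hi, max(lo, cand[i] + step))
                with np.errstate(over = "ignore", invalid = "ignore"):
                    loss = loss_func(func(x, *cand), y)
                if np.isfinite(loss) and loss < best_loss:
                    best, best_loss, improved = cand, loss, True
        if not improved:
            break
    return best, best_loss

This only reaches a local optimum on the integer lattice, so it still needs random restarts, but each restart is far cheaper than enumerating all 11**10 combinations of ten parameters in [-5, 5].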
I also tried an ML approach with torch; it works on the [0, 1] range but not on [-2, 2], so it does not scale.