Symbolic Regression vs. Neural Networks on Multiscale L96#
To recap from the previous notebooks, Lorenz (1996) describes a “two time-scale” model in two equations which are:
This model can be visualized as follows:
import os
import time
from IPython.display import display, SVG
import gplearn
import gplearn.genetic
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydot
import pysindy as ps
import torch
import torch.nn.functional as F
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset
from L96_model import L96
# Ensuring reproducibility
np.random.seed(14)
torch.manual_seed(14);
Dataset Generation#
Let’s generate some data using the same conditions as in the previous notebooks Introduction to Neural Networks and Using Neural Networks for L96 Parameterization.
Obtaining the Ground Truth#
time_steps = 20000
forcing, dt, T = 18, 0.01, 0.01 * time_steps
W = L96(8, 32, F=forcing)
X_true, _, _, subgrid_tend = W.run(dt, T, store=True, return_coupling=True)
X_true, subgrid_tend = X_true.astype(np.float32), subgrid_tend.astype(np.float32)
print(f"X_true shape: {X_true.shape}")
print(f"subgrid_tend shape: {subgrid_tend.shape}")
X_true shape: (20001, 8)
subgrid_tend shape: (20001, 8)
In this case, although we have 8 different dimensions in \(X\), each is derived in the exact same way from neighboring values of \(X\) (and local subdimensions \(Y\)). Therefore, we can actually reduce this problem from an 8-to-8D regression problem down to an 8-to-1D regression problem:
Dx = X_true.shape[1]
inputs = np.vstack([X_true[:, range(-i, Dx - i)] for i in range(Dx)])
targets = np.hstack([subgrid_tend[:, -i] for i in range(Dx)])
print(f"inputs shape: {inputs.shape}")
print(f"targets shape: {targets.shape}")
inputs shape: (160008, 8)
targets shape: (160008,)
Show code cell source
plt.figure(figsize=(16, 4), dpi=150)
plt.title("Multiscale L96 trajectory", fontsize=16)
for i in range(Dx):
plt.plot(inputs[:, i][:500], label=r"$x" + str(i) + "$")
plt.legend(ncol=2, fontsize=14)
plt.xlabel("Time", fontsize=16)
plt.ylabel("Value", fontsize=16)
plt.show()
Show code cell source
fig = plt.figure(figsize=(12, 6), dpi=150)
fig.suptitle("2D histograms of features vs. forcing", fontsize=16)
for i in range(Dx):
plt.subplot(2, 4, i + 1)
plt.hist2d(inputs[:, i], targets, bins=50)
plt.xlabel(f"$x_{i}$", fontsize=16)
plt.xticks(np.arange(-6, 13, 2))
plt.xlim(-8, 14)
if i % 4 == 0:
plt.ylabel("Forcing", fontsize=16)
plt.tight_layout()
Looking at these marginal histograms of each large-scale variable vs. the subgrid forcing, we can see there’s a mostly linear relationship between \(x_0\) and its forcing, suggesting that the subgrid terms tend to force small values higher and large values lower. This suggests that an extremely simple linear model only containing \(x_0\) with negative slope will do reasonably well. However, the relationship isn’t completely linear, and also contains a bifurcation at larger \(x_0\).
For the other features, the relationships are less straightforward, but there’s clearly high mutual information.
Split the Data into Training and Test Sets#
data_idxs = np.arange(len(inputs))
np.random.shuffle(data_idxs)
num_train_samples = int(len(inputs) * 0.75)
train_idxs = data_idxs[:num_train_samples]
test_idxs = data_idxs[num_train_samples:]
Build the Dataset and the Dataloaders#
BATCH_SIZE = 1024
# Training Data
train_dataset = TensorDataset(
torch.tensor(inputs[train_idxs]), torch.tensor(targets[train_idxs])
)
loader_train = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
# Test Data
test_dataset = TensorDataset(
torch.tensor(inputs[test_idxs]), torch.tensor(targets[test_idxs])
)
loader_test = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=True)
Creating and Training the Neural Network#
Now let’s create a neural network and train it on the dataset we generated above.
Define Functions to Train and Evaluate the Model#
Note
The code in this section is taken from the notebook Introduction to Neural Networks.
Show code cell content
def train_model(network, criterion, loader, optimizer):
"""Train the network for one epoch"""
network.train()
train_loss = 0
for batch_x, batch_y in loader:
# Get predictions
if len(batch_x.shape) == 1:
# This if block is needed to add a dummy dimension if our inputs are 1D
# (where each number is a different sample)
prediction = torch.squeeze(network(torch.unsqueeze(batch_x, 1)))
else:
prediction = network(batch_x)
# Compute the loss
loss = criterion(prediction, batch_y)
train_loss += loss.item()
# Clear the gradients
optimizer.zero_grad()
# Backpropagation to compute the gradients and update the weights
loss.backward()
optimizer.step()
return train_loss / len(loader)
def test_model(network, criterion, loader):
"""Test the network"""
network.eval() # Evaluation mode (important when having dropout layers)
test_loss = 0
with torch.no_grad():
for batch_x, batch_y in loader:
# Get predictions
if len(batch_x.shape) == 1:
# This if block is needed to add a dummy dimension if our inputs are 1D
# (where each number is a different sample)
prediction = torch.squeeze(network(torch.unsqueeze(batch_x, 1)))
else:
prediction = network(batch_x)
# Compute the loss
loss = criterion(prediction, batch_y)
test_loss += loss.item()
# Get an average loss for the entire dataset
test_loss /= len(loader)
return test_loss
def fit_model(network, criterion, optimizer, train_loader, val_loader, n_epochs):
"""Train and validate the network"""
train_losses, val_losses = [], []
start_time = time.time()
for epoch in range(1, n_epochs + 1):
train_loss = train_model(network, criterion, train_loader, optimizer)
val_loss = test_model(network, criterion, val_loader)
train_losses.append(train_loss)
val_losses.append(val_loss)
end_time = time.time()
print(f"Training completed in {int(end_time - start_time)} seconds.")
return train_losses, val_losses
Create the Neural Network#
class Net_ANN(nn.Module):
def __init__(self):
super(Net_ANN, self).__init__()
self.linear1 = nn.Linear(8, 16)
self.linear2 = nn.Linear(16, 16)
self.linear3 = nn.Linear(16, 1)
def forward(self, x):
x = F.relu(self.linear1(x))
x = F.relu(self.linear2(x))
x = self.linear3(x)[:, 0]
return x
Train the Network#
nn_3l = Net_ANN()
n_epochs = 20
criterion = torch.nn.MSELoss()
optimizer = optim.Adam(nn_3l.parameters(), lr=0.01)
train_loss, validation_loss = fit_model(
nn_3l, criterion, optimizer, loader_train, loader_test, n_epochs
)
Training completed in 17 seconds.
Visualize Results#
plt.figure(dpi=150)
plt.plot(train_loss, "b", label="Training loss")
plt.plot(validation_loss, "r", label="Validation loss")
plt.legend();
preds_nn = nn_3l(torch.from_numpy(inputs[test_idxs])).detach().numpy()
print("R2 Score:", r2_score(preds_nn, targets[test_idxs]))
R2 Score: 0.9223949649031744
The neural network gets an \(R^2\) of ~ 0.916, which seems pretty good given the non-determinism of the problem.
Comparing with Linear Regression#
linear_reg = LinearRegression()
linear_reg.fit(inputs[train_idxs], targets[train_idxs])
print("R2 Score:", linear_reg.score(inputs[test_idxs], targets[test_idxs]))
R2 Score: 0.8160184487760795
Linear regression also does fairly well, though about 0.1 lower in terms of \(R^2\). Lets look at its coefficients:
plt.figure(dpi=150)
plt.bar(range(8), linear_reg.coef_)
plt.xticks(range(8), [f"x{i}" for i in range(8)], fontsize=12)
plt.ylabel("Coefficient value", fontsize=12)
plt.show()
This matches the 2D histogram plots from earlier that showed a negative linear relationship between forcing and \(x_0\).
plt.figure(figsize=(15, 6), dpi=150)
plt.title("Neural Network vs. Linear Regression", fontsize=16)
preds_nn = nn_3l(torch.from_numpy(inputs)).detach().numpy()
preds_lr = linear_reg.predict(inputs)
random_idx = np.random.choice(len(targets - 1000))
random_slice = slice(random_idx, random_idx + 1000)
plt.plot(preds_nn[random_slice], label="NN Predicted values")
plt.plot(preds_lr[random_slice], label="LR Predicted values")
plt.plot(
targets[random_slice], label="True values", ls="--", color="black", alpha=0.667
)
plt.xlabel(f"Time (starting at idx {random_idx})", fontsize=16)
plt.ylabel("Subgrid forcing", fontsize=16)
plt.legend(fontsize=14)
plt.show()
Symbolic regression with genetic programming#
Let’s try using one of the genetic programming libraries to discover a parameterization:
symbolic_reg = gplearn.genetic.SymbolicRegressor(
population_size=5000,
generations=25,
p_crossover=0.7,
p_subtree_mutation=0.1,
p_hoist_mutation=0.05,
p_point_mutation=0.1,
max_samples=0.9,
verbose=1,
parsimony_coefficient=0.001,
const_range=(-2, 2),
function_set=("add", "mul", "sub", "div", "sin", "log"),
)
symbolic_reg.fit(inputs[train_idxs][:20000], targets[train_idxs][:20000])
| Population Average | Best Individual |
---- ------------------------- ------------------------------------------ ----------
Gen Length Fitness Length Fitness OOB Fitness Time Left
0 15.09 2250.77 3 1.64883 1.66775 3.16m
1 10.32 6.53204 5 1.62545 1.63651 3.08m
2 12.32 16.9512 5 1.62268 1.66148 2.98m
3 7.04 5.26195 20 1.57546 1.6044 2.21m
4 5.07 9.58891 24 1.55363 1.55098 1.81m
5 5.79 5.96614 21 1.51949 1.49863 1.89m
6 9.63 4.65507 17 1.50515 1.50598 2.65m
7 16.11 2.78142 28 1.47296 1.45776 3.65m
8 19.77 2.69051 28 1.4555 1.46709 3.93m
9 21.42 2.80303 41 1.45297 1.48639 3.87m
10 22.63 2.61995 37 1.44204 1.47566 3.95m
11 23.78 2.29202 41 1.42739 1.4853 3.92m
12 24.47 6.6118 8 1.41676 1.43687 3.62m
13 25.15 2.55741 22 1.4201 1.47976 3.29m
14 25.67 2.27797 28 1.41595 1.43681 2.97m
15 24.77 2.45272 42 1.35228 1.38372 2.59m
16 20.61 3.97112 23 1.34902 1.34875 2.04m
17 12.16 3.66793 24 1.33076 1.35901 1.30m
18 14.17 3.62681 24 1.32406 1.35392 1.30m
19 27.37 2.86559 29 1.31132 1.33767 1.83m
20 25.82 2.40423 31 1.30332 1.33496 1.42m
21 23.41 2.45246 38 1.30026 1.32775 59.28s
22 23.45 2.57487 38 1.29615 1.31448 38.39s
23 24.97 2.56582 33 1.29189 1.30837 20.07s
24 24.11 2.33584 37 1.29417 1.32678 0.00s
sub(sin(mul(-0.447, X0)), sub(X0, sin(sub(-1.107, log(add(X2, sub(sub(sub(-1.107, log(add(X2, sub(sin(sub(-1.107, log(mul(-0.447, X0)))), sub(X0, -0.220))))), X0), log(sin(mul(-0.447, X0))))))))))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
sub(sin(mul(-0.447, X0)), sub(X0, sin(sub(-1.107, log(add(X2, sub(sub(sub(-1.107, log(add(X2, sub(sin(sub(-1.107, log(mul(-0.447, X0)))), sub(X0, -0.220))))), X0), log(sin(mul(-0.447, X0))))))))))
print(symbolic_reg._program)
sub(sin(mul(-0.447, X0)), sub(X0, sin(sub(-1.107, log(add(X2, sub(sub(sub(-1.107, log(add(X2, sub(sin(sub(-1.107, log(mul(-0.447, X0)))), sub(X0, -0.220))))), X0), log(sin(mul(-0.447, X0))))))))))
print(
"R2 Score:", r2_score(symbolic_reg.predict(inputs[test_idxs]), targets[test_idxs])
)
R2 Score: 0.8499308040527297
While the program does slightly better than linear regression, it’s extremely complicated (graph visualization below for slightly easier inspection):
# Build the graph
graph = pydot.graph_from_dot_data(symbolic_reg._program.export_graphviz())[0]
graph.write_svg("figs/symbolic_regression.svg")
# Display the graph
display(SVG("figs/symbolic_regression.svg"))
Explore the Pareto frontier#
We can explore the Pareto frontier of the final population of programs as well, to see if there’s a simpler program that still does well
df = pd.DataFrame.from_dict(
[
dict(
idx=i,
expr=str(p),
fitness=p.fitness_,
length=p.length_,
r2_score=r2_score(p.execute(inputs[test_idxs]), targets[test_idxs]),
)
for i, p in enumerate(symbolic_reg._programs[-1])
]
)
pareto_frontier = []
used_programs = set()
for l in range(1, 40):
programs = df[df.length <= l]
best = programs.r2_score.argmax()
idx = programs.idx.values[best]
r2 = programs.r2_score.values[best]
program = symbolic_reg._programs[-1][idx]
if program not in used_programs:
pareto_frontier.append(dict(length=l, r2=r2, expr=str(program)))
used_programs.add(program)
pd.set_option("max_colwidth", 200)
pareto_frontier = pd.DataFrame.from_dict(pareto_frontier)
pareto_frontier
length | r2 | expr | |
---|---|---|---|
0 | 1 | 0.000000 | -0.220 |
1 | 2 | 0.000000 | sin(-1.107) |
2 | 3 | 0.787718 | sub(-0.220, X0) |
3 | 8 | 0.837378 | sub(sin(mul(-0.447, X0)), sub(X0, -0.220)) |
4 | 11 | 0.841150 | sub(sin(mul(-0.447, X0)), sub(X0, sin(mul(-0.447, X0)))) |
5 | 12 | 0.842545 | sub(sin(sin(mul(-0.447, X0))), sub(X0, sin(mul(-0.447, X0)))) |
6 | 13 | 0.843272 | sub(sin(sin(mul(-0.447, X0))), sub(X0, sin(sin(mul(-0.447, X0))))) |
7 | 16 | 0.854240 | sub(sin(mul(-0.447, X0)), sub(X0, sin(sub(-1.107, log(add(X2, div(X0, -1.107))))))) |
8 | 17 | 0.855131 | sub(sin(sub(-1.107, log(add(X2, sub(sin(X0), X0))))), sub(X0, sin(mul(-0.447, X0)))) |
9 | 19 | 0.856080 | sub(sin(sub(-1.107, log(add(X2, sub(sub(sin(X0), X0), -1.107))))), sub(X0, sin(mul(-0.447, X0)))) |
10 | 21 | 0.856531 | sub(sin(sub(-1.107, log(add(X2, sub(sub(-0.447, X0), log(sub(X0, -0.220))))))), sub(X0, sin(mul(-0.307, X0)))) |
11 | 23 | 0.856728 | sub(sin(sub(-1.107, log(add(X2, sub(sub(sin(X0), X0), log(sin(mul(-0.447, X0)))))))), sub(X0, sin(mul(-0.447, X0)))) |
12 | 35 | 0.856754 | sub(sin(sub(-1.107, log(add(X2, sub(sin(sub(-1.107, log(add(X2, sub(sub(sin(-0.605), X0), log(sub(X0, -0.220))))))), sub(X0, sin(mul(-0.447, X0)))))))), sub(X0, sin(mul(-0.447, X0)))) |
13 | 39 | 0.857035 | sub(sin(mul(-0.447, X0)), sub(X0, sin(sub(-1.107, log(add(X2, sub(sub(sin(sub(-1.107, log(add(X2, sub(sub(-0.605, X0), log(sub(-0.447, X0))))))), sub(X0, sin(mul(-0.447, X0)))), log(mul(-0.447, X0... |
Unfortunately, nothing really jumps out here as both simple and performant.
Symbolic Regression with Sparse Linear Regression and Polynomial Features#
We can also apply sparse regression methods to this problem with polynomial features
def learned_expr(coef, names, cutoff=1e-3):
coef_idx = np.argwhere(np.abs(coef) > cutoff)[:, 0]
sort_order = reversed(np.argsort(np.abs(coef[coef_idx])))
return " + ".join(
[
f"{coef[coef_idx[i]]:.4f}*{names[coef_idx[i]].replace(' ','*')}"
for i in sort_order
]
)
pf = PolynomialFeatures(2)
feats = pf.fit_transform(inputs)
names = pf.get_feature_names_out([f"x_{i}" for i in range(8)])
Non-sparse linear regression#
linear_reg = LinearRegression()
linear_reg.fit(feats[train_idxs], targets[train_idxs])
learned_expr(linear_reg.coef_, names)
'-0.6913*x_0 + 0.4333*x_1 + 0.3393*x_7 + 0.2112*x_5 + 0.1420*x_6 + 0.1118*x_2 + -0.0399*x_0*x_1 + -0.0333*x_0*x_2 + -0.0323*x_6*x_7 + -0.0288*x_0*x_7 + -0.0283*x_1*x_2 + -0.0248*x_1*x_5 + -0.0242*x_3 + -0.0219*x_1*x_6 + -0.0209*x_5*x_7 + 0.0201*x_0^2 + -0.0185*x_1*x_4 + -0.0135*x_0*x_5 + 0.0124*x_0*x_3 + 0.0107*x_4 + 0.0099*x_1*x_3 + 0.0090*x_2*x_5 + -0.0088*x_4*x_7 + -0.0081*x_5^2 + -0.0076*x_2*x_3 + -0.0074*x_2*x_7 + 0.0071*x_3*x_4 + 0.0071*x_4*x_5 + -0.0069*x_3*x_5 + 0.0061*x_3*x_6 + 0.0059*x_1*x_7 + -0.0057*x_7^2 + 0.0050*x_3^2 + -0.0048*x_0*x_4 + -0.0045*x_1^2 + -0.0036*x_0*x_6 + -0.0036*x_3*x_7 + 0.0034*x_2^2 + 0.0024*x_4*x_6 + 0.0023*x_2*x_4 + 0.0015*x_2*x_6 + -0.0011*x_5*x_6'
print("R2 Score:", linear_reg.score(feats[test_idxs], targets[test_idxs]))
R2 Score: 0.8872654640415502
Adding polynomial features brings our \(R^2\) much closer to that of the neural network, but we have a ton of terms. Let’s try making the regression sparser.
Sequentially thresholded least squares#
stlsq = ps.optimizers.stlsq.STLSQ(alpha=0.001, threshold=0.01)
stlsq.fit(feats[train_idxs], targets[train_idxs])
learned_expr(stlsq.coef_[0], names)
'-3.7208*1 + -0.7842*x_0 + 0.3574*x_1 + 0.2431*x_7 + 0.1900*x_2 + 0.1778*x_5 + 0.1522*x_6 + -0.0390*x_0*x_1 + -0.0356*x_0*x_2 + -0.0335*x_6*x_7 + 0.0267*x_0^2 + -0.0265*x_1*x_5 + -0.0257*x_0*x_7 + -0.0227*x_1*x_2 + 0.0189*x_4 + -0.0164*x_1*x_6 + -0.0156*x_5*x_7 + 0.0145*x_3 + -0.0120*x_0*x_5 + 0.0110*x_0*x_3'
stlsq.score(feats[test_idxs], targets[test_idxs])
0.882294259449451
Looks like we can get a much sparser expression without sacrificing too much performance, though if we increase the threshold slightly, we will see a trade-off:
stlsq2 = ps.optimizers.stlsq.STLSQ(alpha=0.001, threshold=0.02)
stlsq2.fit(feats[train_idxs], targets[train_idxs])
learned_expr(stlsq2.coef_[0], names)
'-1.5860*1 + -0.9857*x_0 + 0.1324*x_2 + 0.0728*x_3 + -0.0725*x_7 + -0.0436*x_1 + -0.0322*x_0*x_2 + 0.0307*x_0^2'
stlsq2.score(feats[test_idxs], targets[test_idxs])
0.861645175881121
This is much simpler, but less performant. It still strongly outperformed the particluar genetic programming method we used, though. Let’s visualize some of the predictions:
plt.figure(figsize=(15, 6), dpi=150)
plt.title("Neural network vs. quadratic regressions", fontsize=16)
preds_nn = nn_3l(torch.from_numpy(inputs)).detach().numpy()
preds_lr_quad = linear_reg.predict(feats)
preds_st_quad = stlsq.predict(feats)
preds_st_quad2 = stlsq2.predict(feats)
random_idx = np.random.choice(len(targets - 1000))
random_slice = slice(random_idx, random_idx + 1000)
plt.plot(preds_nn[random_slice], label="NN predicted values")
plt.plot(preds_lr_quad[random_slice], label="LR + quadratic")
plt.plot(preds_st_quad[random_slice], label=r"STLSQ + quadratic, threshold=$0.01$")
plt.plot(preds_st_quad2[random_slice], label=r"STLSQ + quadratic, threshold=$0.02$")
plt.plot(
targets[random_slice], label="True values", ls="--", color="black", alpha=0.667
)
plt.xlabel(f"Time (starting at idx {random_idx})", fontsize=16)
plt.ylabel("Subgrid forcing", fontsize=16)
plt.legend(fontsize=14)
plt.show()
Add threshold features#
The 2D histogram of \(x_0\) looks like it might be potentially modelable by piecewise linear regression:
plt.figure(figsize=(4, 4), dpi=150)
plt.hist2d(inputs[:, 0], targets, bins=50)
plt.xlim(-6, 14)
plt.xlabel("$x_0$", fontsize=16)
plt.ylabel("Forcing", fontsize=16)
plt.show()
Sparse regression#
Let’s see if sparse regression will discover that if we provide it with thresholding features and cross-terms:
x0_thresholds = np.array(
[np.ones_like(inputs[:, 0])] + [inputs[:, 0] > i for i in range(6)]
)
x0_threshold_names = [""] + [f"*(x0>{t})" for t in range(6)]
feats2 = np.array(
[
feats[:, i] * x0_thresholds[j]
for i in range(feats.shape[1])
for j in range(len(x0_thresholds))
]
).T
names2 = [
f"{names[i]}{x0_threshold_names[j]}"
for i in range(feats.shape[1])
for j in range(len(x0_thresholds))
]
stlsq3 = ps.optimizers.stlsq.STLSQ(alpha=0.001, threshold=0.25)
stlsq3.fit(feats2[train_idxs], targets[train_idxs])
learned_expr(stlsq3.coef_[0], names2)
/usr/share/miniconda/envs/L96M2lines/lib/python3.9/site-packages/sklearn/utils/extmath.py:189: RuntimeWarning: invalid value encountered in matmul
ret = a @ b
'-2.8051*1*(x0>4) + -0.9871*x_0 + -0.6571*1*(x0>2) + 0.6440*x_0*(x0>4) + -0.5681*1*(x0>0) + -0.5286*1*(x0>1) + -0.2734*1*(x0>5)'
stlsq3.score(feats2[test_idxs], targets[test_idxs])
0.8590951075317006
plt.figure(figsize=(4, 4), dpi=150)
plt.hist2d(inputs[:, 0], targets, bins=50)
fn = (
lambda x0: -2.1200 * 1 * (x0 > 2)
+ -1.2695 * 1 * (x0 > 0)
+ -0.9488 * x0
+ -0.6694 * 1 * (x0 > 4)
+ 0.5175 * x0 * (x0 > 1)
)
xt = np.linspace(-6, 14, 100)
yt = fn(xt)
plt.plot(xt, yt, color="red")
plt.xlim(-6, 14)
plt.xlabel("$x_0$", fontsize=16)
plt.ylabel("Forcing", fontsize=16)
plt.show()
It looks like the model learns a piecewise linear function in a single variable, even though we didn’t explicitly require it!
Genetic programming#
We can also try using genetic programming here, but with a different function set that removes some of the nonlinear operators we included before and replaces them with min and max, which allow for piecewise behavior:
symbolic_reg_pw = gplearn.genetic.SymbolicRegressor(
population_size=5000,
generations=50,
p_crossover=0.7,
p_subtree_mutation=0.1,
p_hoist_mutation=0.05,
p_point_mutation=0.1,
max_samples=0.9,
verbose=1,
parsimony_coefficient=0.001,
const_range=(-2, 2),
function_set=("add", "mul", "max", "min"),
)
symbolic_reg_pw.fit(inputs[train_idxs][:20000], targets[train_idxs][:20000])
| Population Average | Best Individual |
---- ------------------------- ------------------------------------------ ----------
Gen Length Fitness Length Fitness OOB Fitness Time Left
0 27.65 3.78741e+06 7 2.45249 2.54746 5.59m
1 8.00 9.51443 5 1.85843 1.82615 4.06m
2 10.82 7.30098 3 1.6483 1.63764 4.44m
3 8.34 27.6089 3 1.64321 1.68342 4.11m
4 4.92 26.8323 15 1.50401 1.50866 3.67m
5 3.87 16.3644 15 1.44304 1.44976 3.65m
6 6.13 148.18 11 1.40594 1.42375 3.76m
7 12.76 15.4382 13 1.37424 1.38711 4.70m
8 14.48 10.2082 13 1.3658 1.36791 4.90m
9 15.67 12.0612 21 1.35567 1.37664 4.95m
10 15.53 17917.1 23 1.3514 1.40418 4.92m
11 15.51 7.61133 29 1.35019 1.38838 4.71m
12 16.27 4.11042 19 1.34782 1.38404 4.74m
13 15.22 12.9581 23 1.34734 1.38836 4.49m
14 14.50 1540.79 17 1.34519 1.40768 4.27m
15 13.97 5.27408 17 1.34517 1.40786 4.10m
16 14.09 13.3842 17 1.34172 1.43891 4.10m
17 14.21 3.51828 13 1.34363 1.37425 3.89m
18 14.46 84.8708 19 1.34315 1.43713 3.79m
19 14.50 8.1359 15 1.33915 1.37526 3.66m
20 14.49 4.71403 15 1.33942 1.41217 3.50m
21 14.12 4.65016 15 1.33662 1.39804 3.46m
22 13.77 38.3175 19 1.33467 1.42346 3.22m
23 13.67 9.76407 17 1.33539 1.41711 3.09m
24 13.87 9.07149 15 1.33255 1.43467 2.98m
25 14.14 114.952 15 1.33405 1.42116 2.87m
26 14.53 110.894 15 1.33412 1.42058 2.77m
27 14.70 3.25037 19 1.33509 1.41981 2.72m
28 14.72 10.7036 13 1.33059 1.37007 2.50m
29 14.88 3.91762 25 1.32809 1.37427 2.40m
30 14.62 6.29094 15 1.32687 1.39063 2.27m
31 14.57 3.5343 17 1.3249 1.40442 2.15m
32 14.12 3.90761 21 1.32425 1.41418 2.02m
33 13.83 5.69476 13 1.32439 1.42588 1.95m
34 13.48 8.32606 15 1.32568 1.40132 1.77m
35 13.25 79.5995 17 1.32476 1.40569 1.65m
36 13.24 4.45101 13 1.32515 1.41905 1.52m
37 13.04 7.14949 13 1.32524 1.4182 1.39m
38 13.15 5.93211 15 1.32344 1.42146 1.33m
39 13.15 9.69874 17 1.32478 1.40562 1.17m
40 13.18 5.74077 13 1.32703 1.40212 1.05m
41 12.99 9.94645 13 1.32421 1.42748 55.86s
42 12.99 4.36878 13 1.32517 1.41883 49.24s
43 13.12 12.7419 13 1.32593 1.41204 42.90s
44 13.26 3.95264 13 1.32416 1.42798 36.58s
45 13.06 13.1207 21 1.32404 1.42906 28.11s
46 13.04 6.56359 13 1.32464 1.42364 20.94s
47 13.19 6.23439 21 1.32029 1.34429 14.09s
48 13.10 27.2948 21 1.32101 1.33781 7.06s
49 13.26 5.91966 21 1.31816 1.36342 0.00s
add(max(min(mul(max(X0, -1.442), -0.938), X1), -1.937), mul(min(X0, add(max(X5, X3), max(X6, X4))), -0.720))In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
add(max(min(mul(max(X0, -1.442), -0.938), X1), -1.937), mul(min(X0, add(max(X5, X3), max(X6, X4))), -0.720))
print(symbolic_reg_pw._program)
add(max(min(mul(max(X0, -1.442), -0.938), X1), -1.937), mul(min(X0, add(max(X5, X3), max(X6, X4))), -0.720))
print(
"R2 Score:",
r2_score(symbolic_reg_pw.predict(inputs[test_idxs]), targets[test_idxs]),
)
R2 Score: 0.8354791875984894
While this leads to slightly higher \(R^2\) than before, the model is still pretty complex:
# Build the graph
graph = pydot.graph_from_dot_data(symbolic_reg_pw._program.export_graphviz())[0]
graph.write_svg("figs/symbolic_regression_pw.svg")
# Display the graph
from IPython.display import Image, display
display(SVG("figs/symbolic_regression_pw.svg"))
Tuning the hyperparameters could help, but requires more compute.