wavegp-public/wavegp_sklearn.py at main · cselab/wavegp-public

498 lines (445 loc) · 17.5 KB
import os
import random
import subprocess
import sys
import time

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

# Make the sibling project modules importable before importing them.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
import wavegp
from engines.ops import (
    NAME_TO_ID as _NAME_TO_OP,
    ARITY as _OP_ARITY,
    HUMAN as _OP_HUMAN,
    validate_function_set as _validate_function_set,
)
def _bin_path():
    """Return the path ``<module dir>/engines/cuda/test_es_lm``.

    The path is derived from the location of this module; the file is not
    checked for existence.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    relative_parts = ("engines", "cuda", "test_es_lm")
    return os.path.join(module_dir, *relative_parts)
def backend_info():
    """Return what backends CGPRegressor.fit() can use right now.

    Keys: ``cuda`` and ``jax`` map to a path/devices when importable, else
    None.  ``active`` is the backend ``.fit()`` would pick: ``'cuda'`` if
    wavegp_cuda imports, else ``'jax'`` if jax imports, else None.

    Side effect: ``<module dir>/engines/cuda`` is put on ``sys.path`` (once),
    mirroring what ``_fit_one_seed`` does before it tries to import
    ``wavegp_cuda``, so that the answer matches what ``fit()`` would see.
    """
    import os
    import sys

    here = os.path.dirname(os.path.abspath(__file__))
    cuda_dir = os.path.join(here, "engines", "cuda")
    if cuda_dir not in sys.path:
        sys.path.insert(0, cuda_dir)

    info = {"cuda": None, "jax": None, "active": None}
    try:
        import wavegp_cuda
    except ImportError:
        pass
    else:
        info["cuda"] = wavegp_cuda.__file__
    try:
        import jax
    except ImportError:
        pass
    else:
        info["jax"] = [str(d) for d in jax.devices()]
    # CUDA takes precedence over JAX, same order as _fit_one_seed.
    if info["cuda"]:
        info["active"] = "cuda"
    elif info["jax"]:
        info["active"] = "jax"
    return info
class CGPRegressor(BaseEstimator, RegressorMixin):
    """Symbolic regressor exposing the scikit-learn estimator interface.

    ``fit`` runs one independent search per seed through ``_fit_one_seed``
    and keeps the run with the lowest ``best_fitness``.

    Parameters
    ----------
    population_size : int
        Number of genomes evolved together (``G`` in the backends).
    generations : int
        Number of generations to run.
    n_nodes : int
        Node count of the genome spec (``g.n``).
    function_set : sequence of str
        Operation names, validated by ``validate_function_set``.
    metric : str
        Stored only; not read by the code in this class.
    stopping_criteria : float
        Fitness threshold used to report ``first_solved_gen``.
    mutation_prob, n_mutations : float, int
        Mutation settings forwarded to the backend.
    n_offspring, tournament_size : int
        Forwarded to the CUDA backend.
    lm_max_iter, lm_lam0 : int, float
        Settings forwarded to the backend's LM parameter solver.
    feature_names : sequence of str or None
        Input names used in formulas; defaults to ``x0, x1, ...``.
    random_state : None, int or iterable of int
        Seed(s) to run.  ``None`` means the single seed 42.
    dump_every : int
        Trace/progress interval forwarded to the backend.
    verbose : int
        Non-zero prints one summary line per seed.
    p_point_mutation : float or None
        Alias: when not None it overwrites ``mutation_prob`` at fit time.
    warm_start, parsimony_coefficient
        Reserved; any non-default value raises ``NotImplementedError``.

    Attributes set by ``fit``
    -------------------------
    n_features_in_, n_outputs_, cv_results_, best_seed_, best_fitness_,
    best_genome_, best_params_, formulas_, formula_, first_solved_gen_,
    history_, is_fitted_.
    """

    def __init__(
        self,
        population_size=5000,
        generations=2000,
        n_nodes=20,
        function_set=("add", "sub", "mul", "scale", "div"),
        metric="mse",
        stopping_criteria=1e-14,
        mutation_prob=0.1,
        n_mutations=1,
        n_offspring=1,
        tournament_size=1,
        lm_max_iter=10,
        lm_lam0=1e-3,
        feature_names=None,
        random_state=None,
        dump_every=100,
        verbose=0,
        p_point_mutation=None,
        warm_start=False,
        parsimony_coefficient=None,
    ):
        # Only store the arguments; all validation happens in fit().
        self.population_size = population_size
        self.generations = generations
        self.n_nodes = n_nodes
        self.function_set = function_set
        self.metric = metric
        self.stopping_criteria = stopping_criteria
        self.mutation_prob = mutation_prob
        self.n_mutations = n_mutations
        self.n_offspring = n_offspring
        self.tournament_size = tournament_size
        self.lm_max_iter = lm_max_iter
        self.lm_lam0 = lm_lam0
        self.feature_names = feature_names
        self.random_state = random_state
        self.dump_every = dump_every
        self.verbose = verbose
        self.p_point_mutation = p_point_mutation
        self.warm_start = warm_start
        self.parsimony_coefficient = parsimony_coefficient

    def _resolve_aliases(self):
        """Apply the ``p_point_mutation`` alias and reject reserved options.

        Note that this overwrites ``self.mutation_prob`` in place, so the
        value reported by ``get_params()`` changes once ``fit`` has run.

        Raises
        ------
        NotImplementedError
            If ``parsimony_coefficient`` is neither None nor 0.0, or if
            ``warm_start`` is truthy.
        """
        if self.p_point_mutation is not None:
            self.mutation_prob = self.p_point_mutation
        if self.parsimony_coefficient not in (None, 0.0):
            raise NotImplementedError(
                "parsimony_coefficient is reserved (planned); pass None")
        if self.warm_start:
            raise NotImplementedError(
                "warm_start is reserved (planned); pass False")

    def _gspec(self, n_features, n_outputs=1):
        """Build the genome-spec object consumed by the wavegp engines.

        The spec is a bare class carrying ``names``/``arity`` (derived from
        ``function_set``), ``p`` (fixed to 1), ``i`` (inputs), ``n`` (nodes)
        and ``o`` (outputs).
        """
        op_ids = _validate_function_set(self.function_set)

        class g:
            pass

        g.names = tuple(_OP_HUMAN[i] for i in op_ids)
        g.arity = tuple(_OP_ARITY[i] for i in op_ids)
        g.p = 1
        g.i = n_features
        g.n = self.n_nodes
        g.o = n_outputs
        return g

    def fit(self, X, y):
        """Fit one model per seed and keep the best one.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,) or (n_samples, n_outputs)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``y`` is not 1-D/2-D, or the sample counts
            of ``X`` and ``y`` differ.
        NotImplementedError
            For reserved options (see ``_resolve_aliases``).
        RuntimeError
            If every seed produced an exception.
        """
        self._resolve_aliases()
        X = np.asarray(X, dtype=np.float32)
        y = np.asarray(y, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError("X must be 2-D (n_samples, n_features)")
        if y.ndim not in (1, 2):
            raise ValueError("y must be 1-D (n_samples,) or 2-D (n_samples, n_outputs)")
        # Check the sample count for 1-D targets as well as 2-D ones.
        if y.shape[0] != X.shape[0]:
            raise ValueError(f"y has {y.shape[0]} rows but X has {X.shape[0]}")
        # Internally always work in 2-D (n_samples, n_outputs).  Track the
        # original shape so predict() / formula_ surfaces match it.
        self._y_was_1d = (y.ndim == 1)
        if self._y_was_1d:
            y = y[:, None]
        self.n_features_in_ = X.shape[1]
        self.n_outputs_ = y.shape[1]

        seeds = self.random_state
        if seeds is None:
            seeds = [42]
        elif isinstance(seeds, (int, np.integer)):
            seeds = [int(seeds)]
        else:
            seeds = list(seeds)

        # Taken after _resolve_aliases() so the alias is reflected.
        params = self.get_params()
        if self.verbose:
            print(f"[CGPRegressor] fitting {len(seeds)} seed(s)  "
                  f"G={self.population_size}  gn={self.n_nodes}  "
                  f"N={X.shape[0]}  generations={self.generations}",
                  flush=True)

        def _print_one(r):
            # A failed seed is delivered as the exception object itself.
            if isinstance(r, Exception):
                print(f"[CGPRegressor]   FAIL  {r}", flush=True)
                return
            print(f"[CGPRegressor]   seed={r['seed']:>5d}  "
                  f"host={r['host']:>14s}  MSE={r['best_fitness']:.3e}  "
                  f"solved_at={r['first_solved_gen']}  "
                  f"wall={r['wall_seconds']:.1f}s", flush=True)

        cb = _print_one if self.verbose else None
        if len(seeds) == 1:
            # Single seed: run in-process, no dependency on `cowork`.
            r = _fit_one_seed(X, y, seeds[0], params)
            if cb:
                cb(r)
            results = [r]
        else:
            from cowork import pmap
            # list(): results are read more than once below.
            results = list(pmap(
                _SeedWorker(X, y, params),
                seeds,
                on_result=cb,
            ))

        finite = [r for r in results if not isinstance(r, Exception)]
        if not finite:
            raise RuntimeError(f"All seeds failed: {results}")
        self.cv_results_ = results
        best = min(finite, key=lambda r: r["best_fitness"])
        self.best_seed_ = best["seed"]
        self.best_fitness_ = best["best_fitness"]
        self.best_genome_ = best["genome"]
        self.best_params_ = best["params"]
        # `formula` in the result dict is a list of length n_outputs; a bare
        # string is also accepted and wrapped.
        formulas = best["formula"]
        if isinstance(formulas, str):
            formulas = [formulas]
        self.formulas_ = list(formulas)
        self.formula_ = self.formulas_[0]
        self.first_solved_gen_ = best["first_solved_gen"]
        self.history_ = best["history"]
        self.is_fitted_ = True
        return self

    def predict(self, X):
        """Evaluate the best genome on ``X``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray
            Shape ``(n_samples,)`` if ``fit`` received a 1-D ``y``, otherwise
            ``(n_samples, n_outputs)``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``X`` is not 2-D or its feature count differs from the one
            seen in ``fit``.
        """
        if not getattr(self, "is_fitted_", False):
            raise RuntimeError("Call fit() before predict()")
        X = np.asarray(X, dtype=np.float32)
        if X.ndim != 2:
            raise ValueError("X must be 2-D (n_samples, n_features)")
        n_features = X.shape[1]
        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features but the model was fitted with "
                f"{self.n_features_in_}")
        g = self._gspec(n_features, self.n_outputs_)
        from engines.reference import precompute, forward
        from engines.ops import scalar_ops
        # The engine works on a population axis; use a population of one.
        genome = self.best_genome_[None, :, :]
        ptrs, types, output_ptrs = precompute(g, genome)
        # genome op IDs are stored as indices into the function_set; translate
        # back to canonical op IDs so scalar_ops dispatches the right op.
        op_map = np.asarray(_validate_function_set(self.function_set), dtype=np.uint8)
        types = op_map[types]
        params = self.best_params_[None, :]
        inputs = X.T[None, :, :]
        out, _ = forward(scalar_ops, g, params, inputs, ptrs, types, output_ptrs)
        # out shape: (1, n_outputs, n_samples).  Return shape matches fit-time y.
        if self._y_was_1d:
            return np.asarray(out[0, 0])
        return np.asarray(out[0].T)   # (n_samples, n_outputs)

    def equivalent(self, target_expr):
        """Compare ``formula_`` with ``target_expr`` via ``wavegp.equivalent``.

        Only the first output's formula is compared.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not getattr(self, "is_fitted_", False):
            raise RuntimeError("Call fit() before equivalent()")
        feat = self.feature_names or [f"x{i}" for i in range(self.n_features_in_)]
        return wavegp.equivalent(self.formula_, target_expr, vars=feat)
class _SeedWorker:
    """Callable binding a dataset and parameters so only the seed varies.

    ``CGPRegressor.fit`` hands an instance to ``pmap``, which calls it once
    per seed.
    """

    def __init__(self, X, y, params):
        self.X, self.y, self.params = X, y, params

    def __call__(self, seed):
        """Fit a single seed with the bound data and return its result."""
        data, target, settings = self.X, self.y, self.params
        return _fit_one_seed(data, target, seed, settings)
def _fit_one_seed(X, y, seed, params):
    """Fit one seed, using the CUDA backend if importable, else JAX.

    The module directory and its ``engines/cuda`` sub-directory are put on
    ``sys.path`` (only if absent, so repeated calls do not grow the list)
    before ``wavegp_cuda`` is probed.

    Returns whatever ``_fit_one_seed_cuda`` / ``_fit_one_seed_jax`` returns.
    """
    import os
    import sys

    here = os.path.dirname(os.path.abspath(__file__))
    for path in (here, os.path.join(here, "engines", "cuda")):
        if path not in sys.path:
            sys.path.insert(0, path)
    try:
        import wavegp_cuda  # noqa: F401
    except ImportError:
        return _fit_one_seed_jax(X, y, seed, params)
    return _fit_one_seed_cuda(X, y, seed, params)
def _fit_one_seed_cuda(X, y, seed, params):
    """Run one seeded search with the ``wavegp_cuda`` extension.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
    y : numpy.ndarray, shape (n_samples,) or (n_samples, n_outputs)
    seed : int
        Seeds the initial population and is passed to the engine as ``key0``.
    params : dict
        Estimator parameters as returned by ``get_params()``.

    Returns
    -------
    dict
        Keys ``seed``, ``best_fitness``, ``genome``, ``params``, ``formula``
        (list, one entry per output), ``first_solved_gen``, ``history``
        (population-mean fitness per dumped generation), ``wall_seconds``
        and ``host``.
    """
    import os
    import random
    import socket
    import sys
    import time

    import numpy as np

    here = os.path.dirname(os.path.abspath(__file__))
    # Insert only when absent so repeated calls do not grow sys.path.
    for path in (here, os.path.join(here, "engines", "cuda")):
        if path not in sys.path:
            sys.path.insert(0, path)
    import wavegp as wgp
    import wavegp_cuda

    op_ids = _validate_function_set(params["function_set"])
    n_features = X.shape[1]
    n_samples = X.shape[0]
    # y is always 2-D internally: (n_samples, n_outputs).
    if y.ndim == 1:
        y = y[:, None]
    n_outputs = y.shape[1]
    G = params["population_size"]
    gn = params["n_nodes"]

    class g:
        pass

    g.names = tuple(_OP_HUMAN[i] for i in op_ids)
    g.arity = tuple(_OP_ARITY[i] for i in op_ids)
    g.p = 1
    g.i = n_features
    g.n = gn
    g.o = n_outputs

    # Every individual sees the same data: replicate along the population axis.
    inputs = np.ascontiguousarray(
        np.broadcast_to(X.T[None, :, :], (G, n_features, n_samples))
        .astype(np.float32))
    # y arrives as (n_samples, n_outputs); broadcast to (G, n_outputs, n_samples).
    y_T = y.T.astype(np.float32)                       # (n_outputs, n_samples)
    targets = np.ascontiguousarray(
        np.broadcast_to(y_T[None, :, :], (G, n_outputs, n_samples))
        .astype(np.float32))

    # One master RNG hands out an independent sub-seed per initial genome.
    gr = random.Random(seed)
    genomes = np.stack([
        np.asarray(wgp.rand(g, random.Random(gr.randint(0, 2**32))))
        for _ in range(G)
    ]).astype(np.uint8)

    dump_every = params["dump_every"]
    n_gens = params["generations"]
    t0 = time.perf_counter()
    op_map = np.asarray(op_ids, dtype=np.uint8)
    trace_g, trace_p, trace_f, _gen_secs = wavegp_cuda.run_es_lm(
        G=G, gi=g.i, gn=g.n, go=g.o,
        a=max(g.arity), p=g.p, N=n_samples,
        names_len=len(g.names),
        n_mut=params["n_mutations"],
        prob=float(params["mutation_prob"]),
        n_gens=n_gens,
        max_lm_iter=params["lm_max_iter"],
        lam0=float(params["lm_lam0"]),
        key0=int(seed), key1=7,
        dump_every=dump_every,
        inputs=inputs, targets=targets, parent=genomes,
        verbose=int(params.get("verbose", 0)),
        n_offspring=int(params.get("n_offspring", 1)),
        tournament_size=int(params.get("tournament_size", 1)),
        op_map=op_map,
    )
    wall = time.perf_counter() - t0

    def dump_gens(n, k):
        # Generation index of each trace row: 0, k, 2k, ... and always n.
        # NOTE(review): assumes the engine dumps on exactly this schedule.
        if k == 0:
            return [0, n]
        gs = list(range(0, n + 1, k))
        if gs[-1] != n:
            gs.append(n)
        return gs

    DUMPED = dump_gens(n_gens, dump_every)
    final_f = trace_f[-1]
    finite = np.isfinite(final_f)
    # Non-finite fitness is treated as +inf so it can never be selected.
    best_idx = int(np.nanargmin(np.where(finite, final_f, np.inf)))
    best_f = float(final_f[best_idx])

    # First dumped generation at which the eventual winner's slot was already
    # below the threshold.  zip() stops at the shorter sequence, so a trace
    # with fewer rows than expected cannot raise IndexError.
    first_solved_at = None
    stop = params["stopping_criteria"]
    for gen_idx, f in zip(DUMPED, trace_f[:, best_idx]):
        if np.isfinite(f) and f < stop:
            first_solved_at = gen_idx
            break

    feat = params["feature_names"]
    if feat is None:
        feat = [f"x{i}" for i in range(n_features)]
    # as_formula returns a list of length g.o (one per output pointer).
    formula = wgp.as_formula(
        g, trace_g[-1, best_idx], trace_p[-1, best_idx], input_names=feat)
    return {
        "seed": seed,
        "best_fitness": best_f,
        "genome": trace_g[-1, best_idx],
        "params": trace_p[-1, best_idx],
        "formula": formula,
        "first_solved_gen": first_solved_at,
        "history": trace_f.mean(axis=1),
        "wall_seconds": wall,
        "host": socket.gethostname().split(".")[0],
    }
# JAX fallback path: used when wavegp_cuda is not importable (no nvcc at
# install time). Mirrors the CUDA loop using wavegp_jax + wavegp_lm.
# Supports scalar ops (id 0..7): plus, minus, mul, scale, div, exp, sin, cos.
# Each entry maps an op id to f(a, b, p): two operands and one parameter.
# Ids 5..7 are not listed here; _build_jax_all_ops adds them on first use.
_JAX_OPS_BY_ID = {
    0: lambda a, b, p: a + b,
    1: lambda a, b, p: a - b,
    2: lambda a, b, p: a * b,
    3: lambda a, b, p: p * a,
    4: lambda a, b, p: a / b,
}
def _build_jax_all_ops(op_ids):
    """Return ``all_ops(in0, in1, par)`` evaluating every op in ``op_ids``.

    The returned callable applies each selected op to the same operands and
    stacks the results along axis 1, in the order given by ``op_ids``.  Only
    the first parameter column (``par[:, 0:1]``) is passed to the ops.

    Raises
    ------
    NotImplementedError
        If an id in ``op_ids`` has no JAX implementation.
    """
    import jax.numpy as jnp

    # exp/sin/cos are registered on demand, and only if one of them is used,
    # so function sets without them never touch these entries.
    needs_transcendentals = any(op_id in op_ids for op_id in (5, 6, 7))
    if needs_transcendentals:
        _JAX_OPS_BY_ID.setdefault(5, lambda a, b, p: jnp.exp(a))
        _JAX_OPS_BY_ID.setdefault(6, lambda a, b, p: jnp.sin(a))
        _JAX_OPS_BY_ID.setdefault(7, lambda a, b, p: jnp.cos(a))

    selected = []
    for op in op_ids:
        implementation = _JAX_OPS_BY_ID.get(op)
        if implementation is None:
            raise NotImplementedError(
                f"JAX fallback does not support op id {op}; "
                "build wavegp with nvcc or restrict function_set to "
                "{add, sub, mul, scale, div, exp, sin, cos}.")
        selected.append(implementation)

    def all_ops(in0, in1, par):
        first_param = par[:, :1]
        per_op = []
        for implementation in selected:
            per_op.append(implementation(in0, in1, first_param))
        return jnp.stack(per_op, axis=1)

    return all_ops
def _fit_one_seed_jax(X, y, seed, params):
    """Run one seeded search with the JAX engines (``wavegp_jax``/``wavegp_lm``).

    Each generation mutates every genome, re-solves its parameters from zeros
    with the LM solver, and keeps the child only where its MSE is strictly
    lower than the parent's.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
    y : array, shape (n_samples,) or (n_samples, n_outputs)
    seed : int
        Seeds the JAX PRNG key.
    params : dict
        Estimator parameters as returned by ``get_params()``.

    Returns
    -------
    dict
        Same keys as ``_fit_one_seed_cuda``: ``seed``, ``best_fitness``,
        ``genome``, ``params``, ``formula`` (list, one entry per output),
        ``first_solved_gen``, ``history`` (population-mean fitness per
        generation, generation 0 included), ``wall_seconds`` and ``host``.

    Raises
    ------
    NotImplementedError
        If ``function_set`` contains an op without a JAX implementation.
    """
    import os
    import socket
    import sys
    import time

    import numpy as np
    import jax
    import jax.numpy as jnp

    here = os.path.dirname(os.path.abspath(__file__))
    # Insert only when absent so repeated calls do not grow sys.path.
    if here not in sys.path:
        sys.path.insert(0, here)
    import wavegp as wgp
    import wavegp_jax
    import wavegp_lm

    op_ids = _validate_function_set(params["function_set"])
    n_features = X.shape[1]
    n_samples = X.shape[0]
    G = params["population_size"]
    gn = params["n_nodes"]
    # y is always 2-D internally (n_samples, n_outputs); promote 1-D for safety
    if y.ndim == 1:
        y = y[:, None]
    n_outputs = y.shape[1]

    class g:
        pass

    g.names = tuple(_OP_HUMAN[i] for i in op_ids)
    g.arity = tuple(_OP_ARITY[i] for i in op_ids)
    g.p = 1
    g.i = n_features
    g.n = gn
    g.o = n_outputs

    inputs_j = jnp.broadcast_to(
        X.T[None, :, :], (G, n_features, n_samples)).astype(jnp.float32)
    # y arrives as (n_samples, n_outputs) -> broadcast to (G, n_outputs, n_samples)
    y_T = jnp.asarray(y.T, dtype=jnp.float32)
    targets_j = jnp.broadcast_to(
        y_T[None, :, :], (G, n_outputs, n_samples)).astype(jnp.float32)

    init_key, key = jax.random.split(jax.random.PRNGKey(int(seed)))
    genomes_j = wavegp_jax.rand_population(g, init_key, G)
    all_ops_jax = _build_jax_all_ops(op_ids)
    lm_solve = wavegp_lm.make_lm_solver(
        g, all_ops_jax, inputs_j, targets_j,
        max_iter=int(params["lm_max_iter"]),
        lam0=float(params["lm_lam0"]),
        tol=1e-10)

    def _population_mse(outputs):
        # NOTE(review): only output 0 is scored, so additional outputs do not
        # influence selection on this path — confirm whether that is intended.
        mse = jnp.mean((outputs[:, 0] - targets_j[:, 0]) ** 2, axis=1)
        # NaN would poison the `<` comparison below; treat it as worst case.
        return jnp.where(jnp.isnan(mse), jnp.inf, mse)

    ptrs, types, output_ptrs = wavegp_jax.precompute(g, genomes_j)
    p_arr, _, _, _ = lm_solve(
        jnp.zeros((G, g.n * g.p)), ptrs, types, output_ptrs)
    out, _, _ = wavegp_jax.forward(
        all_ops_jax, g, p_arr, inputs_j, ptrs, types, output_ptrs)
    fit = _population_mse(out)

    n_gens = int(params["generations"])
    n_mut = int(params["n_mutations"])
    prob = float(params["mutation_prob"])
    stop = float(params["stopping_criteria"])
    verbose = int(params.get("verbose", 0))
    # Clamp to 1 so the modulo in the progress print can never divide by zero.
    dump_every = max(1, int(params.get("dump_every", 100)))

    history = [float(jnp.mean(fit))]
    first_solved = None
    if float(jnp.min(fit)) < stop:
        first_solved = 0
    if verbose >= 2:
        print(f"[CGPRegressor jax]   gen    0  "
              f"best={float(jnp.min(fit)):.3e}  "
              f"mean={history[-1]:.3e}", flush=True)

    # The timer starts after the initial evaluation above.
    t0 = time.perf_counter()
    for gen in range(1, n_gens + 1):
        key, sub = jax.random.split(key)
        child = wavegp_jax.mutate(genomes_j, sub, g, n_mut, prob)
        cp, ct, co = wavegp_jax.precompute(g, child)
        cp_, _, _, _ = lm_solve(jnp.zeros((G, g.n * g.p)), cp, ct, co)
        cout, _, _ = wavegp_jax.forward(
            all_ops_jax, g, cp_, inputs_j, cp, ct, co)
        cfit = _population_mse(cout)
        # Replace a parent only on strict improvement, slot by slot.
        imp = cfit < fit
        genomes_j   = jnp.where(imp[:, None, None], child, genomes_j)
        p_arr       = jnp.where(imp[:, None],       cp_,   p_arr)
        fit         = jnp.where(imp,                cfit,  fit)
        ptrs        = jnp.where(imp[:, None, None], cp,    ptrs)
        types       = jnp.where(imp[:, None],       ct,    types)
        output_ptrs = jnp.where(imp[:, None],       co,    output_ptrs)
        history.append(float(jnp.mean(fit)))
        if first_solved is None and float(jnp.min(fit)) < stop:
            first_solved = gen
        if verbose >= 2 and (gen % dump_every == 0 or gen == n_gens):
            print(f"[CGPRegressor jax]   gen {gen:4d}  "
                  f"best={float(jnp.min(fit)):.3e}  "
                  f"mean={history[-1]:.3e}  "
                  f"improved={int(imp.sum()):3d}/{G}", flush=True)
    wall = time.perf_counter() - t0

    genomes_np = np.asarray(genomes_j, dtype=np.uint8)
    p_np = np.asarray(p_arr)
    fit_np = np.asarray(fit)
    finite = np.isfinite(fit_np)
    best_idx = int(np.nanargmin(np.where(finite, fit_np, np.inf)))
    best_f = float(fit_np[best_idx])
    feat = params["feature_names"] or [f"x{i}" for i in range(n_features)]
    # Keep the whole per-output list, matching the CUDA path;
    # CGPRegressor.fit accepts either a list or a single string.
    formula = wgp.as_formula(
        g, genomes_np[best_idx], p_np[best_idx], input_names=feat)
    return {
        "seed": int(seed),
        "best_fitness": best_f,
        "genome": genomes_np[best_idx],
        "params": p_np[best_idx],
        "formula": formula,
        "first_solved_gen": first_solved,
        "history": np.asarray(history, dtype=np.float64),
        "wall_seconds": wall,
        "host": socket.gethostname().split(".")[0],
    }
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

wavegp_sklearn.py

Latest commit

History

wavegp_sklearn.py

File metadata and controls