Advanced Features

This guide covers the advanced features added to PyGeomodeling for production use.

Model Serialization

Save and load trained models with complete metadata and versioning.

Basic Usage

from spe9_geomodeling import save_model, load_model
from sklearn.gaussian_process import GaussianProcessRegressor

# Train your model
model = GaussianProcessRegressor()
model.fit(X_train, y_train)

# Save with metadata
save_model(
    model=model,
    model_name="my_gpr_model",
    model_type="gpr",
    backend="sklearn",
    save_dir="saved_models",
    scaler=scaler,  # Optional
    metrics={"r2": 0.85, "mse": 0.12},  # Optional
    description="Production model trained on SPE9 dataset"
)

Loading Models

# Load the model
model, metadata, scaler = load_model("my_gpr_model", save_dir="saved_models")

# Check metadata
print(f"Model: {metadata.model_name}")
print(f"Type: {metadata.model_type}")
print(f"Created: {metadata.created_at}")
print(f"Metrics: {metadata.performance_metrics}")

# Use the model
predictions = model.predict(X_test)

Advanced Serialization

from spe9_geomodeling import ModelSerializer, ModelMetadata

# Create serializer
serializer = ModelSerializer(save_dir="models")

# Create detailed metadata
metadata = ModelMetadata(
    model_name="production_gpr_v1",
    model_type="gpr",
    backend="sklearn",
    version="1.0"
)

# Add training information
metadata.add_training_info(
    n_samples=len(X_train),
    n_features=X_train.shape[1],
    feature_names=["x", "y", "z"],
    training_time=15.3
)

# Add hyperparameters
metadata.add_hyperparameters({
    "kernel": "RBF + Matern",
    "alpha": 1e-10,
    "n_restarts_optimizer": 10
})

# Save with full metadata
model_dir = serializer.save_model(model, metadata, scaler)

# List all saved models
models = serializer.list_models()
print(f"Saved models: {models}")

# Get model info without loading
info = serializer.get_model_info("production_gpr_v1")

Spatial Cross-Validation

Proper cross-validation for spatial data that accounts for spatial autocorrelation.

Spatial K-Fold

from spe9_geomodeling import SpatialKFold, cross_validate_spatial
from sklearn.gaussian_process import GaussianProcessRegressor

# Create spatial CV splitter
cv = SpatialKFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
model = GaussianProcessRegressor()
results = cross_validate_spatial(
    model=model,
    X=X,  # Must have spatial coordinates in first 3 columns
    y=y,
    cv=cv,
    scoring="r2",
    return_train_score=True,
    verbose=True
)

print(f"Test R²: {results['test_score'].mean():.4f} ± {results['test_score'].std():.4f}")
print(f"Train R²: {results['train_score'].mean():.4f} ± {results['train_score'].std():.4f}")

Block Cross-Validation

from spe9_geomodeling import BlockCV

# Create block CV with 3x3x1 blocks
cv = BlockCV(n_blocks_x=3, n_blocks_y=3, n_blocks_z=1, buffer_size=0.1)

# Use with cross-validation
results = cross_validate_spatial(model, X, y, cv=cv)

Hyperparameter Tuning with Optuna

from spe9_geomodeling import HyperparameterTuner
from sklearn.ensemble import RandomForestRegressor

# Define parameter search space
param_space = {
    "n_estimators": {"type": "int", "low": 50, "high": 500},
    "max_depth": {"type": "int", "low": 3, "high": 20},
    "min_samples_split": {"type": "int", "low": 2, "high": 20},
    "min_samples_leaf": {"type": "int", "low": 1, "high": 10},
}

# Create tuner
tuner = HyperparameterTuner(
    model_class=RandomForestRegressor,
    param_space=param_space,
    cv=5,
    n_trials=100,
    scoring="r2",
    random_state=42
)

# Run tuning
results = tuner.tune(X, y, verbose=True)

# Get best model
best_model = tuner.get_best_model()
best_model.fit(X_train, y_train)

# Access optimization history
study = results["study"]
print(f"Best parameters: {results['best_params']}")
print(f"Best score: {results['best_score']:.4f}")

Parallel Processing

Speed up model training and predictions using parallel processing.

Parallel Model Training

from spe9_geomodeling import ParallelModelTrainer
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Define multiple models
models = {
    "gpr_rbf": GaussianProcessRegressor(),
    "random_forest": RandomForestRegressor(n_estimators=100),
    "svr": SVR(kernel="rbf"),
}

# Train all models in parallel
trainer = ParallelModelTrainer(n_jobs=-1, verbose=1)
trained_models = trainer.train_models(models, X_train, y_train)

# Train and evaluate in parallel
results = trainer.train_and_evaluate(
    models=models,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test
)

# View results
for name, result in results.items():
    print(f"\n{name}:")
    print(f"  R²: {result['metrics']['r2']:.4f}")
    print(f"  MSE: {result['metrics']['mse']:.4f}")
    print(f"  Training time: {result['training_time']:.2f}s")

Batch Predictions

from spe9_geomodeling import BatchPredictor

# Create batch predictor
predictor = BatchPredictor(n_jobs=-1, batch_size=1000, verbose=True)

# Make predictions in parallel batches
predictions = predictor.predict(model, X_large_dataset)

# For GP models with uncertainty
predictions, std_devs = predictor.predict(gp_model, X, return_std=True)

# Predict with multiple models
predictions_dict = predictor.predict_multiple_models(trained_models, X)

Parallel Grid Search

from spe9_geomodeling import parallel_grid_search
from sklearn.ensemble import RandomForestRegressor

# Define parameter grid
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [5, 10, 15, 20],
    "min_samples_split": [2, 5, 10],
}

# Run parallel grid search
results = parallel_grid_search(
    model_class=RandomForestRegressor,
    param_grid=param_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    n_jobs=-1,
    verbose=True
)

print(f"Best parameters: {results['best_params']}")
print(f"Best score: {results['best_score']:.4f}")

Parallel Cross-Validation

from spe9_geomodeling import ParallelCrossValidator, SpatialKFold

# Create parallel CV
cv_parallel = ParallelCrossValidator(n_jobs=-1, verbose=True)

# Run cross-validation with parallel fold evaluation
cv_splitter = SpatialKFold(n_splits=10)
results = cv_parallel.cross_validate(
    model=model,
    X=X,
    y=y,
    cv_splitter=cv_splitter
)

print(f"Mean score: {results['mean_score']:.4f} ± {results['std_score']:.4f}")

Error Handling

All modules now include comprehensive error handling with helpful suggestions.

Custom Exceptions

from spe9_geomodeling import exceptions

try:
    parser = GRDECLParser("nonexistent.grdecl")
    data = parser.load_data()
except exceptions.DataLoadError as e:
    print(e)  # Includes helpful suggestion

Common Error Scenarios

# File not found
try:
    data = load_spe9_data("missing_file.grdecl")
except exceptions.DataLoadError as e:
    print(e.message)
    print(e.suggestion)

# Invalid format
try:
    parser = GRDECLParser("invalid.txt")
    data = parser.load_data()
except exceptions.FileFormatError as e:
    print(e)

# Property not found
try:
    slice_data = parser.get_property_slice("INVALID_PROP")
except exceptions.PropertyNotFoundError as e:
    print(e)

# Model not trained
try:
    predictions = model.predict(X_test)
except exceptions.ModelNotTrainedError as e:
    print(e.suggestion)

Complete Workflow Example

from spe9_geomodeling import (
    load_spe9_data,
    UnifiedSPE9Toolkit,
    SpatialKFold,
    HyperparameterTuner,
    ParallelModelTrainer,
    save_model,
    cross_validate_spatial,
)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor

# 1. Load data
data = load_spe9_data()

# 2. Prepare features
toolkit = UnifiedSPE9Toolkit()
toolkit.load_spe9_data(data)
X_train, X_test, y_train, y_test = toolkit.create_train_test_split()

# 3. Hyperparameter tuning with spatial CV
param_space = {
    "n_estimators": {"type": "int", "low": 50, "high": 300},
    "max_depth": {"type": "int", "low": 5, "high": 20},
}

tuner = HyperparameterTuner(
    model_class=RandomForestRegressor,
    param_space=param_space,
    cv=SpatialKFold(n_splits=5),
    n_trials=50,
    random_state=42
)

tuning_results = tuner.tune(X_train, y_train)
best_model = tuner.get_best_model()

# 4. Train multiple models in parallel
models = {
    "tuned_rf": best_model,
    "gpr": GaussianProcessRegressor(),
    "rf_default": RandomForestRegressor(),
}

trainer = ParallelModelTrainer(n_jobs=-1)
results = trainer.train_and_evaluate(
    models, X_train, y_train, X_test, y_test
)

# 5. Save best model
best_name = max(results.keys(), key=lambda k: results[k]["metrics"]["r2"])
best_model_obj = results[best_name]["model"]

save_model(
    model=best_model_obj,
    model_name=f"production_{best_name}",
    model_type=best_name,
    backend="sklearn",
    metrics=results[best_name]["metrics"],
    hyperparameters=tuning_results["best_params"]
)

print(f"\n✓ Best model: {best_name}")
print(f"  R²: {results[best_name]['metrics']['r2']:.4f}")
print(f"  Saved to: production_{best_name}")

GPU Support (GPyTorch Models)

For GPyTorch models, GPU acceleration is automatically used when available.

import torch
from spe9_geomodeling import UnifiedSPE9Toolkit

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Create toolkit with GPyTorch backend
toolkit = UnifiedSPE9Toolkit(backend="gpytorch")
toolkit.load_data()

# Models will automatically use GPU if available
model = toolkit.create_gpytorch_model("deep_gp")
toolkit.train_gpytorch_model(model, "deep_gp_gpu")

Best Practices

Always use spatial cross-validation for geostatistical data
Save models with metadata for reproducibility
Use parallel processing for large datasets or multiple models
Tune hyperparameters with Optuna for optimal performance
Handle exceptions gracefully with try-except blocks
Version your models when deploying to production
Document hyperparameters in model metadata
Use batch predictions for very large datasets