# Source code for mercedestrenz.train

# Author: Ty Andrews
# Date: 2023-01-12
import pandas as pd
from importlib import resources
import joblib

from sklearn.model_selection import cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV


def train_mercedes_price_prediction_model(
    data: pd.DataFrame,
    model_version: str,
    model_type: str = "gradient_boosting",
    n_iter: int = 25,
    cv_results=None,
    save_model: bool = False,
    overwrite_version: bool = False,
):
    """Trains a model to predict the price of a Mercedes-Benz given its model,
    year, condition, odometer reading and paint color.

    The hyperparameters are tuned with a 5-fold randomized search, the best
    pipeline is cross validated again on the same data, and the summary of
    that cross validation is stored in ``cv_results`` under ``model_type``.

    Parameters
    ----------
    data : pd.DataFrame
        The raw used mercedes data. Must contain columns for model, year, condition, odometer_mi, paint_color, and price_USD.
    model_version : str
        The version of the model to train and subsequently save.
    model_type : str, optional
        The type of model to use to train on the data, by default "gradient_boosting"
    n_iter : int, optional
        How many iterations of randomized search to do during tuning, by default 25
    cv_results : dict, optional
        Pass existing dictionary of results to have these results added to it
        (keyed by ``model_type``). By default None, which starts a new dictionary.
    save_model : bool, optional
        Whether to save a version of the model, by default False
    overwrite_version : bool, optional
        If a version of that name already exists use this to overwrite it, by default False

    Returns
    -------
    Tuple[model, cv_results]
        The best performing model and the results of the cross validation.

    Raises
    ------
    ValueError
        If the data does not contain the required columns, or if no rows
        remain once rows with null values have been dropped.

    Examples
    --------
    >>> from mercedestrenz.train import train_mercedes_price_prediction_model
    >>> model, results = train_mercedes_price_prediction_model(data, "v2", save_model=False)
    """

    # Build a fresh dict per call; a mutable default argument would be shared
    # between unrelated calls and silently accumulate their results.
    if cv_results is None:
        cv_results = {}

    numeric_features = ["year", "odometer_mi"]
    ordinal_features = ["condition"]
    categorical_features = ["model", "paint_color"]
    target = "price_USD"

    required_columns = [
        "model",
        "year",
        "condition",
        "odometer_mi",
        "paint_color",
        "price_USD",
    ]

    if not set(required_columns).issubset(data.columns):
        raise ValueError(
            "data must contain columns for model, year, condition, odometer_mi, paint_color, and price_USD"
        )

    # Only the required columns are kept, so nulls in unrelated columns do not
    # cause rows to be dropped below.
    train_data = data.loc[:, required_columns]

    if train_data.isnull().values.any():
        print("Input train_data has null values, dropping any rows with null values.")
        num_samples_before = train_data.shape[0]
        train_data = train_data.dropna()
        num_samples_after = train_data.shape[0]
        print(
            f"Removed {num_samples_before - num_samples_after} rows. {len(train_data)} rows remaining."
        )

    if train_data.empty:
        raise ValueError(
            "data has no rows left to train on after dropping rows with null values"
        )

    # put the primary metric first for what the model is refit to with all the data
    # at the end of randomized search
    scoring_metrics = ["neg_root_mean_squared_error", "r2"]
    primary_metric = scoring_metrics[0]

    X_train = train_data.drop(columns=[target])
    y_train = train_data[target]

    columntransformer = make_column_transformer(
        numeric_features, ordinal_features, categorical_features
    )
    model = make_model(model_type)
    param_grid = get_random_search_param_grid(model_type)

    pipe = make_pipeline(columntransformer, model)

    model_random_search = RandomizedSearchCV(
        pipe,
        param_distributions=param_grid,
        scoring=scoring_metrics,
        refit=primary_metric,
        n_jobs=-1,
        n_iter=n_iter,
        cv=5,
        return_train_score=True,
        random_state=42,
        verbose=2,
    )

    model_random_search.fit(X_train, y_train)

    best_model = model_random_search.best_estimator_

    # Cross validate the tuned pipeline once more to get per-fold scores that
    # can be summarised as mean/std for the returned results.
    model_cv = cross_validate(
        best_model,
        X_train,
        y_train,
        cv=5,
        scoring=scoring_metrics,
        return_train_score=True,
    )

    search_results = model_random_search.cv_results_
    best_index = model_random_search.best_index_

    print(f"Best model: {model_random_search.best_params_}")
    print(
        f"Best model train {primary_metric}: "
        f"{search_results[f'mean_train_{primary_metric}'][best_index]:.1f}"
    )
    print(
        f"Best model test {primary_metric}: "
        f"{search_results[f'mean_test_{primary_metric}'][best_index]:.2f}"
    )

    cv_results[model_type] = pd.DataFrame(model_cv).agg(["mean", "std"]).round(3).T

    if save_model:
        export_mercedes_price_model(best_model, model_version, overwrite_version)

    return best_model, cv_results


def make_model(model_type: str):
    """Makes a model for the mercedes price prediction model

    Parameters
    ----------
    model_type : str
        What type of model to use. Only "gradient_boosting" is recognized.

    Returns
    -------
    Model
        An unfitted ``GradientBoostingRegressor`` using squared error loss
        and a fixed random state of 42.

    Raises
    ------
    ValueError
        If ``model_type`` is not a recognized model type.
    """

    # Reject unknown types up front so the supported case reads straight through.
    if model_type != "gradient_boosting":
        raise ValueError(f"model_type {model_type} not recognized")

    return GradientBoostingRegressor(loss="squared_error", random_state=42)


def get_random_search_param_grid(model_type: str):
    """Gets a random search parameter grid for the mercedes
    price prediction model

    Parameters
    ----------
    model_type : str
        What type of model to use. Only "gradient_boosting" is recognized.

    Returns
    -------
    dict
        A random search parameter grid for the mercedes price prediction model.
        Keys are prefixed with ``gradientboostingregressor__``, the step name
        ``make_pipeline`` gives a ``GradientBoostingRegressor`` step.

    Raises
    ------
    ValueError
        If ``model_type`` is not a recognized model type.
    """

    if model_type != "gradient_boosting":
        raise ValueError(f"model_type {model_type} not recognized")

    return {
        "gradientboostingregressor__n_estimators": [150, 200, 250],
        "gradientboostingregressor__max_depth": [3, 5, 7, 9],
        "gradientboostingregressor__min_samples_split": [2, 3, 4, 5],
        "gradientboostingregressor__min_samples_leaf": [1, 2, 3, 4],
        "gradientboostingregressor__subsample": [0.5, 0.6, 0.8],
    }


def make_column_transformer(numeric_features, ordinal_features, categorical_features):
    """Makes a column transformer for the mercedes price prediction model

    Numeric features are standardized, categorical features are one-hot
    encoded and ordinal features are encoded with a fixed condition ordering.

    Parameters
    ----------
    numeric_features : list
        List of numeric features to include in the model
    ordinal_features : list
        List of ordinal features to include in the model. Only one category
        ordering (the vehicle condition scale) is supplied to the encoder.
    categorical_features : list
        List of categorical features to include in the model

    Returns
    -------
    ColumnTransformer
        A column transformer for the mercedes price prediction model
    """

    # Condition levels listed from worst to best; the position in this list is
    # the integer code the ordinal encoder assigns.
    condition_order = [
        "salvage",
        "used",
        "fair",
        "good",
        "excellent",
        "like new",
        "new",
    ]

    # Dense output, with categories seen fewer than 5 times grouped as
    # "infrequent" by the encoder.
    onehot_encoder = OneHotEncoder(
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        min_frequency=5,
    )
    ordinal_encoder = OrdinalEncoder(categories=[condition_order])

    # The transformer order determines the column order of the output.
    columntransformer = ColumnTransformer(
        [
            ("scaling", StandardScaler(), numeric_features),
            ("onehot", onehot_encoder, categorical_features),
            ("ordinal", ordinal_encoder, ordinal_features),
        ]
    )

    return columntransformer


def export_mercedes_price_model(model_pipeline, version="v1", overwrite=False):
    """Exports the sklearn model pipeline for mercedes price prediction

    The pipeline is written with joblib to
    ``mercedes_price_prediction_{version}.joblib`` inside the
    ``mercedestrenz.models`` package.

    Parameters
    ----------
    model_pipeline : PipeLine
        sklearn pipeline with the model and preprocessing steps
    version : str, optional
        What to tag the model version by. By default "v1"
    overwrite : bool, optional
        Whether to replace an existing file for this version. By default False

    Raises
    ------
    ValueError
        If a model file for ``version`` already exists and ``overwrite`` is
        not set.
    """

    model_name = f"mercedes_price_prediction_{version}.joblib"

    # NOTE(review): `resources.path` is deprecated as of Python 3.11, and it
    # may not handle a resource that does not exist yet the same way on every
    # Python version -- confirm a brand-new version can be saved on the
    # supported interpreters.
    with resources.path("mercedestrenz.models", model_name) as model_path:
        if model_path.exists() and not overwrite:
            raise ValueError(
                f"Model version {version} already exists. Set overwrite=True to overwrite."
            )
        joblib.dump(model_pipeline, model_path)
        print("Model saved to: ", model_path)