RANDOM_STATE = 42

%matplotlib inline
from pathlib import Path

import numpy as np
import pandas as pd
import rich
import scipy
import scipy.stats as stats
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import TimeSeriesSplit

# Set a global aesthetic theme
sns.set_theme(style="whitegrid", context="notebook", palette="muted")
plt.rcParams["figure.figsize"] = (12, 6)
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["axes.labelsize"] = 12
pd.set_option("display.max_columns", None)

# Loading the data
maimai_path = Path("maimai.csv")
bpm_path = Path("bpm.csv")

maimai_df = pd.read_csv(maimai_path)
bpm_df = pd.read_csv(bpm_path)

maimai_df

bpm_df

# Merging datasets
maimai_df.rename(columns={"Song": "songId"}, inplace=True)
merged_df = maimai_df.merge(bpm_df, on="songId", how="left")
print(f"Duplicated: {merged_df.duplicated().sum()}")
merged_df.head()

Duplicated: 0

# Clean up unnecessary columns
merged_df.drop(columns=["FC/AP", "Sync", "DX ✦", "DX %"], inplace=True)

# Expose missing values
merged_df["Chart Constant"] = pd.to_numeric(
    merged_df["Chart Constant"], errors="coerce"
)

# convert releaseDate to numerical values
merged_df["releaseDate"] = pd.to_datetime(merged_df["releaseDate"], errors="coerce")
merged_df["releaseDate"] = (
    merged_df["releaseDate"] - merged_df["releaseDate"].min()
).dt.days
merged_df.head(3)

print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
merged_df[merged_df.isna().any(axis=1)]

Total rows with missing values: 15
Rows with missing values:

# Fill missing Chart Constant values based on Level
level_mapping = merged_df.groupby("Level")["Chart Constant"].median()
print(level_mapping)

merged_df.fillna(
    {"Chart Constant": merged_df["Level"].map(level_mapping)}, inplace=True
)

print(
    f"\nRemaining missing Chart Constant values: {merged_df['Chart Constant'].isna().sum()}"
)
merged_df[["Level", "Chart Constant"]].head(10)

Level
10     10.20
10+    10.70
11     11.20
11+    11.80
12     12.35
12+    12.80
13     13.00
6       6.00
6+      6.80
7       7.20
7+      7.80
8       8.20
8+      8.70
9       9.30
9+      9.80
Name: Chart Constant, dtype: float64

Remaining missing Chart Constant values: 0

# Simply map Level "6" -> 6.2, "6+" -> 6.8 for remaining missing values


def map_level_to_constant(level):
    if isinstance(level, str) and level.endswith("+"):
        return float(level[:-1]) + 0.8
    else:
        try:
            return float(level) + 0.2
        except ValueError:
            return None


merged_df["Chart Constant"] = merged_df.apply(
    lambda row: (
        map_level_to_constant(row["Level"])
        if pd.isna(row["Chart Constant"])
        else row["Chart Constant"]
    ),
    axis=1,
)

print(
    f"\nRemaining missing Chart Constant values: {merged_df['Chart Constant'].isna().sum()}"
)
merged_df[["Level", "Chart Constant"]].head(10)

Remaining missing Chart Constant values: 0

# Drop easy charts
# merged_df = merged_df[merged_df["Chart Constant"] >= 10.5]

merged_df["DX Score"] = (
    merged_df["DX Score"].astype(str).str.split("/").str[1].astype(int)
)
merged_df.head(3)

# Remove the % suffix from the "Achv" column and convert it to float
merged_df["Achv"] = merged_df["Achv"].str.rstrip("%").astype(float)
merged_df.head(5)

# display lowest Achv
merged_df.nsmallest(10, "Achv")

# drop Achv outliners
merged_df = merged_df[(merged_df["Achv"] >= 85)]

# song map metrics
songs_df = Path("songs.csv")
songs_df = pd.read_csv(songs_df)
songs_df.head()

# player-map relation metrics
playcount_df = Path("playcount.csv")
playcount_df = pd.read_csv(playcount_df)
playcount_df.head()

# Shift last played by 1 hour early due to Japan timezone
playcount_df["Last Played"] = pd.to_datetime(playcount_df["Last Played"]) - pd.Timedelta(hours=1)
# Feature engineer Year/Month/Daytime from Last Played
playcount_df["Last Played_Year"] = pd.to_datetime(playcount_df["Last Played"]).dt.year
playcount_df["Last Played_Month"] = pd.to_datetime(playcount_df["Last Played"]).dt.month
playcount_df["Last Played_Day"] = pd.to_datetime(playcount_df["Last Played"]).dt.day
# playcount_df["Last Played_Daytime"] = pd.to_datetime(playcount_df["Last Played"]).dt.hour
# playcount_df["Last Played_Daytime"] = pd.cut(
#     playcount_df["Last Played_Daytime"],
#     bins=[-1, 12, 18, 24],
#     labels=["Morning", "Afternoon", "Night"],
#     right=True,
# )
playcount_df["Last Played"] = pd.to_numeric(playcount_df["Last Played"])
playcount_df.head()

# Match columns with merged_df
playcount_df["Difficulty"] = playcount_df["Difficulty"].str.upper()
playcount_df.rename(columns={"Version": "Chart"}, inplace=True)
# Drop redundancy columns
playcount_df.drop(columns=["Achievement", "Level"], inplace=True)

# Merge w.r.t. Song name & Version (DX/STD) & Difficulty (basic/advanced/expert/master)
merged_df = merged_df.merge(
    playcount_df,
    left_on=["songId", "Chart", "Difficulty"],
    right_on=["Name", "Chart", "Difficulty"],
    how="left",
)
merged_df.head()

songs_df["songId"] = songs_df["songId"].str.replace('"', "")

merged_df.rename(columns={"Chart": "type", "Difficulty": "difficulty"}, inplace=True)
merged_df["type"] = merged_df["type"].str.lower()
merged_df["difficulty"] = merged_df["difficulty"].str.lower()

merged_df = merged_df.merge(
    songs_df, on=["songId", "type", "difficulty"], how="left", copy=False
)

# Check new dataset for missing values
print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
merged_df[merged_df.isna().any(axis=1)]

Total rows with missing values: 165
Rows with missing values:

note_count_columns = [
    "noteCounts.tap",
    "noteCounts.hold",
    "noteCounts.slide",
    "noteCounts.touch",
    "noteCounts.break",
]

for col in note_count_columns:
    if col in merged_df.columns:
        merged_df[col] = pd.to_numeric(merged_df[col], errors="coerce")

merged_df[note_count_columns] = merged_df[note_count_columns].fillna(0)

total_notes = merged_df[note_count_columns].sum(axis=1).replace(0, 1)
merged_df["noteCounts.total"] = merged_df["DX Score"] / 3

merged_df[note_count_columns] = (
    merged_df[note_count_columns]
    .multiply(merged_df["noteCounts.total"] / total_notes, axis=0)
    .round()
)

print(
    f"\nRemaining missing noteCounts values:\n{merged_df[note_count_columns].isna().sum()}"
)
merged_df[merged_df[note_count_columns].isna().any(axis=1)]

Remaining missing noteCounts values:
noteCounts.tap      0
noteCounts.hold     0
noteCounts.slide    0
noteCounts.touch    0
noteCounts.break    0
dtype: int64

# Check new dataset for missing values
print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
# Hide cols with no missing values
merged_df[merged_df.isna().any(axis=1)].head()

Total rows with missing values: 165
Rows with missing values:

# SongId specific feature engineering
# * Length of songId
# * language classifier (English/Japanese)
# * HasPunctuation
merged_df["songId_length"] = merged_df["songId"].str.strip().str.len()
merged_df["songId_English"] = (
    merged_df["songId"].str.count(r"[A-Za-z0-9]").fillna(0).astype(int)
)
merged_df["songId_Japanese"] = (
    merged_df["songId"]
    .str.count(r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\uFF65-\uFF9F]")
    .fillna(0)
    .astype(int)
)
merged_df["songId_Punctuation"] = (
    merged_df["songId"].str.count(r"[^\w\s]").fillna(0).astype(int)
)
merged_df.columns

Index(['songId', 'Genre', 'Version', 'type', 'difficulty', 'Level', 'Achv',
       'Rank', 'DX Score', 'Chart Constant', 'bpm', 'releaseDate', 'Name',
       'Play Count', 'Last Played', 'Last Played_Year', 'Last Played_Month',
       'Last Played_Day', 'noteCounts.tap', 'noteCounts.hold',
       'noteCounts.slide', 'noteCounts.touch', 'noteCounts.break',
       'noteCounts.total', 'noteDesigner', 'songId_length', 'songId_English',
       'songId_Japanese', 'songId_Punctuation'],
      dtype='object')

# Unique versions
print(merged_df["Version"].unique())
# Sort versions properly
versions = [
    "maimai",
    "maimai PLUS",
    "GreeN",
    "GreeN PLUS",
    "ORANGE",
    "ORANGE PLUS",
    "PiNK",
    "PiNK PLUS",
    "MURASAKi",
    "MURASAKi PLUS",
    "MiLK",
    "MiLK PLUS",
    "FiNALE",
    "maimaiでらっくす",
    "maimaiでらっくす PLUS",
    "Splash",
    "Splash PLUS",
    "UNiVERSE",
    "UNiVERSE PLUS",
    "FESTiVAL",
    "FESTiVAL PLUS",
    "BUDDiES",
    "BUDDiES PLUS",
    "PRiSM (beta)",
    "PRiSM PLUS (beta)",
    "CiRCLE",
    # TODO: Don't hardcode this. Also, this is not needed for tree-based models.
    # We use this only for linear models. If we remove Ridge, we can cut this liability.
]
version_order = {version: i + 1 for i, version in enumerate(versions)}

merged_df["Version"] = merged_df["Version"].map(version_order)
merged_df["Version"]

['UNiVERSE' 'BUDDiES' 'BUDDiES PLUS' 'FESTiVAL' 'MiLK' 'FiNALE'
 'maimaiでらっくす' 'PiNK PLUS' 'MURASAKi' 'MURASAKi PLUS' 'PRiSM (beta)'
 'ORANGE' 'MiLK PLUS' 'UNiVERSE PLUS' 'FESTiVAL PLUS' 'PiNK' 'ORANGE PLUS'
 'maimai PLUS' 'PRiSM PLUS (beta)' 'maimaiでらっくす PLUS' 'Splash'
 'Splash PLUS' 'maimai' 'CiRCLE (experimental)' 'GreeN' 'GreeN PLUS']

0      18.0
1      22.0
2      18.0
3      23.0
4      20.0
       ... 
531    17.0
532    12.0
533    13.0
534    14.0
535     7.0
Name: Version, Length: 536, dtype: float64

# Convert "type" into simple numeric categories
type_mapping = {"std": 0, "dx": 1}
merged_df["type"] = merged_df["type"].map(type_mapping)
merged_df.head(3)

# Convert "difficulty" into simple numeric categories
difficulty_mapping = {
    "basic": 0,
    "advanced": 1,
    "expert": 2,
    "master": 3,
    "re:master": 4,
}
merged_df["difficulty"] = merged_df["difficulty"].map(difficulty_mapping)
merged_df.head(3)

merged_df.describe(include="all")

# Filter out numerical columns for correlation matrix
correlation_matrix = merged_df.select_dtypes(include=["number"]).corr()

fig = plt.figure(clear=True, figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    square=True,
    cmap="coolwarm",
    center=0,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title("Correlation Matrix", fontsize=16, pad=20)
plt.show()

def correlation_rankings(
    corr_mat: pd.DataFrame, target_column: str, threshold: float = 0.95
):
    """Print all abs(correlation with anything) that exceeds a threshold"""
    output = []

    for col in corr_mat.columns:
        if col == target_column:
            continue

        strong_correlations = (
            corr_mat[col][corr_mat[col].abs() > threshold]
            .drop(col)
            .sort_values(ascending=False, key=lambda x: x.abs())
        )

        if not strong_correlations.empty:
            formatted_corrs: list = []
            for corr_col, value in strong_correlations.items():
                color: str = "bold green" if value > 0 else "bold red"
                formatted_corrs.append(f"[{color}]{corr_col} ({value:.2f})[/{color}]")

            corr_with_target = corr_mat.loc[col, target_column]
            correlations_str = ", ".join(formatted_corrs)

            header = "".join(
                [
                    f"[bold sky_blue1]{col} ({target_column}: {corr_with_target:.2f}, ",
                    f"count: {len(strong_correlations)}):[/bold sky_blue1]",
                ]
            )
            full_line = f"{header} [{correlations_str}]"

            output.append(full_line)

    output.sort(
        key=lambda x: float(x.split(f"({target_column}: ")[1].split(",")[0]),
        reverse=True,
    )
    if output:
        rich.print("\n".join(output))


correlation_rankings(correlation_matrix, target_column="Achv")

DX Score (Achv: -0.07, count: 1): [noteCounts.total (1.00)]
noteCounts.total (Achv: -0.07, count: 1): [DX Score (1.00)]

# Drop collinear columns
merged_df.drop(
    columns=[
        "DX Score",
    ],
    inplace=True,
)

correlation_matrix = merged_df.select_dtypes(include=["number"]).corr()
correlation_rankings(correlation_matrix, target_column="Achv")

fig = plt.figure(clear=True, figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    square=True,
    cmap="coolwarm",
    center=0,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title("Correlation Matrix (Reduced)", fontsize=16, pad=20)
plt.show()

# Pairwise correlation with seaborn
sns.pairplot(
    merged_df,
    diag_kind="kde",
    plot_kws={"alpha": 0.4, "s": 15},
    corner=True,
    height=1.7,
)
plt.show()

print(merged_df.columns)

numerical_features = (
    merged_df.select_dtypes(include=["number"]).columns.drop("Achv").tolist()
)
categorical_features = ["Genre"]

drop_columns = [
    feature
    for feature in merged_df.columns
    if feature not in numerical_features + categorical_features + ["Achv"]
]
print(f"Dropping columns: {drop_columns}")
merged_df.drop(
    columns=drop_columns,
    inplace=True,
)

merged_df.dropna(inplace=True)

Index(['songId', 'Genre', 'Version', 'type', 'difficulty', 'Level', 'Achv',
       'Rank', 'Chart Constant', 'bpm', 'releaseDate', 'Name', 'Play Count',
       'Last Played', 'Last Played_Year', 'Last Played_Month',
       'Last Played_Day', 'noteCounts.tap', 'noteCounts.hold',
       'noteCounts.slide', 'noteCounts.touch', 'noteCounts.break',
       'noteCounts.total', 'noteDesigner', 'songId_length', 'songId_English',
       'songId_Japanese', 'songId_Punctuation'],
      dtype='object')
Dropping columns: ['songId', 'Level', 'Rank', 'Name', 'noteDesigner']

# merged_df.to_csv("maimai.cleaned.csv", index=False)

merged_df.sort_values(by="Last Played", inplace=True)

testset_size = 0.5
X_train, X_test = merged_df.iloc[
    : int(len(merged_df) * (1 - testset_size)), :
], merged_df.iloc[
    int(len(merged_df) * (1 - testset_size)) :, :
]
y_train, y_test = X_train.pop("Achv"), X_test.pop("Achv")

# Visualize distribution of Achv
plt.figure(figsize=(6, 3))
sns.histplot(y_train, bins=100, kde=True)
plt.title("Distribution of y_train")
plt.xlabel("Achv (%)")
plt.show()

TRANSFORM_ACHV = ["box-cox", "log", "none"][0]

if TRANSFORM_ACHV == "box-cox":
    # Convert non-linear Achv score to a (almost) gaussian like distribution
    y_train, achv_boxcox_lambda = stats.boxcox(y_train)
    y_test = stats.boxcox(y_test, lmbda=achv_boxcox_lambda)
    # The target variable now has crazy large values, so we normalize it
    scaler = StandardScaler()
    y_train = scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
    y_test = scaler.transform(y_test.reshape(-1, 1)).flatten()
    print(f"Achv Box-Cox Lambda: {achv_boxcox_lambda}")
elif TRANSFORM_ACHV == "log":
    y_train = np.log(y_train / (101 - y_train))
    y_test = np.log(y_test / (101 - y_test))
else:
    pass

plt.figure(figsize=(6, 3))
sns.histplot(y_train, bins=100, kde=True)
plt.title(f"Distribution of y_train after {TRANSFORM_ACHV} Transformation")
plt.xlabel("Achv")
plt.ylabel("Frequency")
plt.show()


def Achv_inverse_transform(y_transformed):
    if TRANSFORM_ACHV == "box-cox":
        # Inverse Box-Cox transformation
        y = np.asarray(y_transformed).reshape(-1, 1)
        y = scaler.inverse_transform(y).flatten()
        y = scipy.special.inv_boxcox(y, achv_boxcox_lambda)
    elif TRANSFORM_ACHV == "log":
        y = 101 / (1 + np.exp(-y_transformed))
    else:
        y = y_transformed
    return y

Achv Box-Cox Lambda: 27.019015925780376

preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    (
                        "poly",
                        PolynomialFeatures(
                            degree=1, include_bias=False, interaction_only=False
                        ),
                    ),
                    ("scaler", StandardScaler()),
                ]
            ),
            numerical_features,
        ),
        (
            "cat",
            OneHotEncoder(
                sparse_output=False,
                handle_unknown="error",
                categories=[
                    merged_df[column].unique().tolist()
                    for column in categorical_features
                ],
            ),
            categorical_features,
        ),
    ]
)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            ElasticNetCV(
                max_iter=100000,
                l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
                random_state=RANDOM_STATE,
                cv=TimeSeriesSplit(n_splits=5),
            ),
        ),
    ]
)

pipeline.fit(X_train, y_train)
chosen_alpha = pipeline.named_steps["regressor"].alpha_
print("Chosen alpha:", chosen_alpha)
l1_ratio = pipeline.named_steps["regressor"].l1_ratio_
print("Chosen l1_ratio:", l1_ratio)

Chosen alpha: 0.06262986919313132
Chosen l1_ratio: 1.0

y_train_pred = pipeline.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_absolute_error(y_train, y_train_pred)
train_mse_transformed = mean_absolute_error(
    Achv_inverse_transform(y_train), Achv_inverse_transform(y_train_pred)
)
print("Train set performance:")
print(f"R^2 Score: {train_r2:.4f}")
print(f"Mean Absolute Error: {train_mse:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {train_mse_transformed:.4f}")

y_test_pred = pipeline.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_absolute_error(y_test, y_test_pred)
test_mse_transformed = mean_absolute_error(
    Achv_inverse_transform(y_test), Achv_inverse_transform(y_test_pred)
)
print("Test set performance:")
print(f"R^2 Score: {test_r2:.4f}")
print(f"Mean Absolute Error: {test_mse:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {test_mse_transformed:.4f}")

cat_encoder = pipeline.named_steps["preprocessor"].named_transformers_["cat"]
cat_feature_names = cat_encoder.get_feature_names_out(categorical_features)

all_feature_names = (
    list(cat_feature_names)
    + pipeline.named_steps["preprocessor"]
    .named_transformers_["num"]
    .named_steps["poly"]
    .get_feature_names_out(numerical_features)
    .tolist()
)

regressor_step = pipeline.named_steps["regressor"]

results_df = pd.DataFrame(
    {
        "Feature": [all_feature_names[i] for i in range(len(all_feature_names))],
        "Coefficient": regressor_step.coef_
    }
).sort_values(by="Coefficient", ascending=False, key=abs)
results_df = results_df[results_df["Coefficient"] != 0]
display(results_df)

Train set performance:
R^2 Score: 0.5654
Mean Absolute Error: 0.5362
Mean Absolute Error (Inverse Transformed): 1.1676
Test set performance:
R^2 Score: 0.3839
Mean Absolute Error: 0.6909
Mean Absolute Error (Inverse Transformed): 1.3568

y_train_inv = Achv_inverse_transform(y_train)
y_train_pred_inv = Achv_inverse_transform(y_train_pred)
y_test_inv = Achv_inverse_transform(y_test)
y_test_pred_inv = Achv_inverse_transform(y_test_pred)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Common scatter kwargs
scatter_kws = {"alpha": 0.6, "s": 40, "edgecolor": "white", "linewidth": 0.5}
line_kws = {"color": "#333333", "linestyle": "--", "linewidth": 2, "alpha": 0.8}

# Predicted vs Actual for training set
axes[0].scatter(y_train_inv, y_train_pred_inv, **scatter_kws)
axes[0].plot(
    [y_train_inv.min(), y_train_inv.max()],
    [y_train_inv.min(), y_train_inv.max()],
    **line_kws,
)
axes[0].set_xlabel("Actual Achv")
axes[0].set_ylabel("Predicted Achv")
axes[0].set_title("Training Set: Predicted vs Actual")

# Predicted vs Actual for test set
axes[1].scatter(y_test_inv, y_test_pred_inv, color="C1", **scatter_kws)
axes[1].plot(
    [y_test_inv.min(), y_test_inv.max()],
    [y_test_inv.min(), y_test_inv.max()],
    **line_kws,
)
axes[1].set_xlabel("Actual Achv")
axes[1].set_ylabel("Predicted Achv")
axes[1].set_title("Test Set: Predicted vs Actual")

# Residual QQ plot
residuals = y_test - y_test_pred
stats.probplot(residuals, dist="norm", plot=axes[2])
axes[2].get_lines()[0].set_markerfacecolor("C2")
axes[2].get_lines()[0].set_markeredgecolor("white")
axes[2].get_lines()[0].set_alpha(0.6)
axes[2].get_lines()[1].set_color("#333333")
axes[2].get_lines()[1].set_linewidth(2)
axes[2].set_title("QQ Plot of Residuals")

plt.tight_layout()
plt.show()

preprocessor_rf = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_features),
        (
            "cat",
            OneHotEncoder(
                sparse_output=True,
                handle_unknown="error",
                categories=[
                    merged_df[column].unique().tolist()
                    for column in categorical_features
                ],
            ),
            categorical_features,
        ),
    ]
)
pipeline_rf = Pipeline(
    steps=[
        ("preprocessor", preprocessor_rf),
        (
            "rfr",
            RandomForestRegressor(
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)
param_grid = {
    "rfr__max_depth": np.arange(2, 5, 1),
    "rfr__min_samples_leaf": np.arange(2, 50, 1),
    "rfr__ccp_alpha": np.arange(0, 0.04, 0.001),
}

grid_search = RandomizedSearchCV(
    pipeline_rf,
    param_grid,
    scoring="r2",
    n_jobs=-1,
    verbose=10,
    random_state=RANDOM_STATE,
    n_iter=100,
    cv=TimeSeriesSplit(n_splits=10),
)

grid_search.fit(X_train, y_train)
pipeline_rf = grid_search.best_estimator_
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV R^2: {grid_search.best_score_:.4f}")

Fitting 10 folds for each of 100 candidates, totalling 1000 fits

Best parameters: {'rfr__min_samples_leaf': np.int64(4), 'rfr__max_depth': np.int64(4), 'rfr__ccp_alpha': np.float64(0.003)}
Best CV R^2: -0.1283

y_train_pred_rf = pipeline_rf.predict(X_train)
train_r2_rf = r2_score(y_train, y_train_pred_rf)
train_mse_rf = mean_absolute_error(y_train, y_train_pred_rf)
train_mse_rf_transformed = mean_absolute_error(
    Achv_inverse_transform(y_train), Achv_inverse_transform(y_train_pred_rf)
)
print("Train set performance:")
print(f"R^2 Score: {train_r2_rf:.4f}")
print(f"Mean Absolute Error: {train_mse_rf:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {train_mse_rf_transformed:.4f}")

y_test_pred_rf = pipeline_rf.predict(X_test)
test_r2_rf = r2_score(y_test, y_test_pred_rf)
test_mse_rf = mean_absolute_error(y_test, y_test_pred_rf)
test_mse_rf_transformed = mean_absolute_error(
    Achv_inverse_transform(y_test), Achv_inverse_transform(y_test_pred_rf)
)
print("Test set performance:")
print(f"R^2 Score: {test_r2_rf:.4f}")
print(f"Mean Absolute Error: {test_mse_rf:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {test_mse_rf_transformed:.4f}")

Train set performance:
R^2 Score: 0.7257
Mean Absolute Error: 0.4245
Mean Absolute Error (Inverse Transformed): 0.9093
Test set performance:
R^2 Score: 0.4535
Mean Absolute Error: 0.7243
Mean Absolute Error (Inverse Transformed): 1.3905

y_train_inv_rf = Achv_inverse_transform(y_train)
y_train_pred_inv_rf = Achv_inverse_transform(y_train_pred_rf)
y_test_inv_rf = Achv_inverse_transform(y_test)
y_test_pred_inv_rf = Achv_inverse_transform(y_test_pred_rf)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Common scatter kwargs
scatter_kws = {"alpha": 0.6, "s": 40, "edgecolor": "white", "linewidth": 0.5}
line_kws = {"color": "#333333", "linestyle": "--", "linewidth": 2, "alpha": 0.8}

# 1. Predicted vs Actual for training set
axes[0].scatter(y_train_inv_rf, y_train_pred_inv_rf, color="C0", **scatter_kws)
axes[0].plot(
    [y_train_inv_rf.min(), y_train_inv_rf.max()],
    [y_train_inv_rf.min(), y_train_inv_rf.max()],
    **line_kws,
)
axes[0].set_xlabel("Actual Achv")
axes[0].set_ylabel("Predicted Achv")
axes[0].set_title("Training Set: Predicted vs Actual (RF)")

# 2. Predicted vs Actual for test set
axes[1].scatter(y_test_inv_rf, y_test_pred_inv_rf, color="C1", **scatter_kws)
axes[1].plot(
    [y_test_inv_rf.min(), y_test_inv_rf.max()],
    [y_test_inv_rf.min(), y_test_inv_rf.max()],
    **line_kws,
)
axes[1].set_xlabel("Actual Achv")
axes[1].set_ylabel("Predicted Achv")
axes[1].set_title("Test Set: Predicted vs Actual (RF)")

plt.tight_layout()
plt.show()

rf_features_step = pipeline_rf.named_steps["preprocessor"].named_transformers_["num"]
rf_numeric_feature_names = rf_features_step.get_feature_names_out(numerical_features)

cat_encoder_rf = pipeline_rf.named_steps["preprocessor"].named_transformers_["cat"]
cat_feature_names_rf = cat_encoder_rf.get_feature_names_out(categorical_features)

all_feature_names_rf = list(rf_numeric_feature_names) + list(cat_feature_names_rf)
rf_model = pipeline_rf.named_steps["rfr"]

# Calculate Split Counts (Frequency)
n_features = len(all_feature_names_rf)
split_counts = np.zeros(n_features, dtype=int)

for tree in rf_model.estimators_:
    tree_features = tree.tree_.feature
    # Filter out leaf nodes (indicated by -2)
    indices = tree_features[tree_features >= 0]
    np.add.at(split_counts, indices, 1)

# Create DataFrame with Importance and Splits
importance_df = pd.DataFrame(
    {
        "Feature": all_feature_names_rf,
        "Importance": rf_model.feature_importances_,
        "Splits": split_counts,
    }
).sort_values(by="Importance", ascending=False)

# Filter for visualization
useful_features = importance_df[importance_df["Importance"] > 0].copy()

# Create custom labels: "Feature (Importance | Splits)"
useful_features["Label"] = useful_features.apply(
    lambda x: f"{x['Importance']:.3f} ({int(x['Splits'])})", axis=1
)

display(useful_features)

# Visualization
fig, ax = plt.subplots(figsize=(14, len(useful_features) * 0.4 + 2))

# Create gradient colors based on Importance
norm = plt.Normalize(useful_features["Importance"].min(), useful_features["Importance"].max())
colors = plt.cm.viridis(norm(useful_features["Importance"]))

bars = ax.barh(
    range(len(useful_features)),
    useful_features["Importance"],
    color=colors,
    edgecolor="none",
    height=0.7
)

# Add text labels inside or next to bars
for i, (bar, label) in enumerate(zip(bars, useful_features["Label"])):
    width = bar.get_width()
    ax.text(
        width,
        bar.get_y() + bar.get_height() / 2,
        f" {label}",
        va="center",
        ha="left",
        fontsize=10,
        color="#333333",
        fontweight="medium"
    )

ax.set_yticks(range(len(useful_features)))
ax.set_yticklabels(useful_features["Feature"], fontsize=11)
ax.set_xlabel("Importance (Gain)", fontsize=12)
ax.set_title("Feature Importances (Gain) and Split Counts", fontsize=15, pad=15)
ax.invert_yaxis()

# Remove top and right spines for cleaner look
sns.despine()

plt.tight_layout()
plt.show()

C:\Users\123er\AppData\Local\Temp\ipykernel_7824\3721033785.py:77: UserWarning: Glyph 65286 (\N{FULLWIDTH AMPERSAND}) missing from font(s) Arial.
  plt.tight_layout()
d:\Personal Data\Repositories\personal-repo\maimai-dx-regression\.venv\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 65286 (\N{FULLWIDTH AMPERSAND}) missing from font(s) Arial.
  fig.canvas.print_figure(bytes_io, **kw)

# Show the test set predictions and actual scores
test_results_df = pd.DataFrame(
    {
        "Actual Achv": Achv_inverse_transform(y_test),
        "Predicted Achv (ElasticNet)": Achv_inverse_transform(y_test_pred),
        "Predicted Achv (RandomForest)": Achv_inverse_transform(y_test_pred_rf),
    }
)
display(test_results_df)

	Song	Genre	Version	Chart	Difficulty	Level	Achv	Rank	FC/AP	Sync	DX ✦	DX %	DX Score	Chart Constant
0	躯樹の墓守	maimai	UNiVERSE	DX	BASIC	6	94.23%	AAA	FC	SYNC	0	75.40%	570/756	6
1	系ぎて	maimai	BUDDiES	DX	BASIC	7	96.63%	AAA	-	SYNC	0	74.20%	766/1032	7
2	廻廻奇譚	POPS＆ANIME	UNiVERSE	DX	ADVANCED	7	97.65%	S	-	SYNC	0	79.00%	886/1122	7.5
3	INTERNET YAMERO	POPS＆ANIME	BUDDiES PLUS	DX	ADVANCED	7+	96.34%	AAA	-	SYNC	0	73.20%	854/1167	7.8
4	INTERNET OVERDOSE	POPS＆ANIME	FESTiVAL	DX	ADVANCED	7	96.11%	AAA	-	SYNC	0	76.80%	820/1068	7.5
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
535	Kiss Me Kiss	オンゲキ＆CHUNITHM	Splash PLUS	DX	MASTER	12+	94.07%	AAA	-	-	0	75.80%	1635/2157	12.8
536	Session High⤴	オンゲキ＆CHUNITHM	MiLK PLUS	STD	MASTER	12+	94.75%	AAA	-	-	0	76.20%	1226/1608	12.8
537	イロトリドリのメロディ	オンゲキ＆CHUNITHM	FiNALE	STD	MASTER	12+	94.80%	AAA	-	-	0	73.40%	1267/1725	12.7
538	Agitation！	オンゲキ＆CHUNITHM	maimaiでらっくす	DX	MASTER	12	96.04%	AAA	-	-	0	73.20%	1461/1995	12
539	Counselor	オンゲキ＆CHUNITHM	PiNK	STD	MASTER	12	97.02%	S	-	SYNC	0	81.90%	1769/2160	12.4

	songId	bpm	releaseDate
0	"411Ψ892"	184.0	2023-02-03
1	#狂った民族２ PRAVARGYAZOOQA	170.0	2022-11-18
2	+♂	180.0	2015-12-09
3	-OutsideR:RequieM-	181.0	2019-03-08
4	1000年生きてる	100.0	2023-09-14
...	...	...	...
1383	ヤミナベ!!!!	264.0	2025-12-24
1384	勝手に生きましょ	176.0	2026-01-09
1385	化けの花	180.0	2026-01-09
1386	胡蝶乃舞	180.0	2026-01-23
1387	零號車輛	240.0	2026-01-23

	songId	Genre	Version	Chart	Difficulty	Level	Achv	Rank	FC/AP	Sync	DX %	DX Score	Chart Constant	bpm	releaseDate
0	躯樹の墓守	maimai	UNiVERSE	DX	BASIC	6	94.23%	AAA	FC	SYNC	75.40%	570/756	6	211.0	2022-02-10
1	系ぎて	maimai	BUDDiES	DX	BASIC	7	96.63%	AAA	-	SYNC	74.20%	766/1032	7	88.0	2024-02-29
2	廻廻奇譚	POPS＆ANIME	UNiVERSE	DX	ADVANCED	7	97.65%	S	-	SYNC	79.00%	886/1122	7.5	185.0	2021-09-16
3	INTERNET YAMERO	POPS＆ANIME	BUDDiES PLUS	DX	ADVANCED	7+	96.34%	AAA	-	SYNC	73.20%	854/1167	7.8	185.0	2024-03-21
4	INTERNET OVERDOSE	POPS＆ANIME	FESTiVAL	DX	ADVANCED	7	96.11%	AAA	-	SYNC	76.80%	820/1068	7.5	163.0	2023-03-23

	songId	Genre	Version	Chart	Difficulty	Level	Achv	Rank	DX Score	Chart Constant	bpm	releaseDate
5	さくゆいたいそう	POPS＆ANIME	BUDDiES	DX	ADVANCED	6+	99.26%	SS	829/1005	6.8	NaN	NaN
78	前前前世	POPS＆ANIME	MURASAKi PLUS	STD	EXPERT	9	97.21%	S	1111/1395	NaN	190.0	1807.0
95	ヒトガタ	POPS＆ANIME	UNiVERSE PLUS	DX	EXPERT	10+	99.78%	SS+	1463/1650	10.8	NaN	NaN
100	さくゆいたいそう	POPS＆ANIME	BUDDiES	DX	EXPERT	9+	99.46%	SS	1422/1614	9.7	NaN	NaN
129	メズマライザー	niconico＆VOCALOID™	PRiSM PLUS (beta)	DX	EXPERT	10	100.65%	SSS+	1425/1572	10.5	NaN	NaN
130	サイエンス	niconico＆VOCALOID™	CiRCLE (experimental)	DX	EXPERT	10	100.78%	SSS+	1468/1596	10.0	188.0	NaN
196	ラグトレイン	niconico＆VOCALOID™	UNiVERSE	DX	EXPERT	9+	96.85%	AAA	1280/1608	9.8	NaN	NaN
224	有頂天ドリーマーズ	東方Project	CiRCLE (experimental)	DX	EXPERT	10+	100.53%	SSS+	1182/1338	10.8	188.0	NaN
233	Ultimate taste	東方Project	PRiSM (beta)	DX	EXPERT	11	99.97%	SS+	1547/1785	11.0	NaN	NaN
367	411Ψ892	maimai	FESTiVAL	DX	EXPERT	12+	99.08%	SS	1568/1893	12.7	NaN	NaN
448	さくゆいたいそう	POPS＆ANIME	BUDDiES	DX	MASTER	12+	99.25%	SS	1763/2043	12.6	NaN	NaN
467	メズマライザー	niconico＆VOCALOID™	PRiSM PLUS (beta)	DX	MASTER	11+	100.08%	SSS	1850/2025	11.9	NaN	NaN
468	サイエンス	niconico＆VOCALOID™	CiRCLE (experimental)	DX	MASTER	13	98.35%	S+	1875/2250	13.0	188.0	NaN
495	エイリアンエイリアン	niconico＆VOCALOID™	MURASAKi PLUS	STD	MASTER	12+	96.18%	AAA	1775/2268	12.8	NaN	NaN
508	有頂天ドリーマーズ	東方Project	CiRCLE (experimental)	DX	MASTER	13	92.22%	AA	1415/1935	13.1	188.0	NaN

	songId	Genre	Version	Chart	Difficulty	Level	Achv	Rank	DX Score	Chart Constant	bpm	releaseDate
0	躯樹の墓守	maimai	UNiVERSE	DX	BASIC	6	94.23	AAA	756	6.0	211.0	3501.0
1	系ぎて	maimai	BUDDiES	DX	BASIC	7	96.63	AAA	1032	7.0	88.0	4250.0
2	廻廻奇譚	POPS＆ANIME	UNiVERSE	DX	ADVANCED	7	97.65	S	1122	7.5	185.0	3354.0
3	INTERNET YAMERO	POPS＆ANIME	BUDDiES PLUS	DX	ADVANCED	7+	96.34	AAA	1167	7.8	185.0	4271.0
4	INTERNET OVERDOSE	POPS＆ANIME	FESTiVAL	DX	ADVANCED	7	96.11	AAA	1068	7.5	163.0	3907.0

	songId	Genre	Version	Chart	Difficulty	Level	Achv	Rank	DX Score	Chart Constant	bpm	releaseDate
186	グリーンライツ・セレナーデ	niconico＆VOCALOID™	maimaiでらっくす	DX	EXPERT	10	66.53	B	1839	10.3	200.0	2598.0
141	バグ	niconico＆VOCALOID™	BUDDiES	DX	EXPERT	10	69.89	B	1854	10.2	186.0	4082.0
342	炎歌 -ほむらうた-	maimai	BUDDiES PLUS	DX	EXPERT	9+	81.31	A	1017	9.8	108.0	286.0
41	FEEL the BEATS	maimai	PiNK	STD	ADVANCED	8+	83.66	A	1485	8.6	166.0	1380.0
383	Fragrance	maimai	maimai PLUS	STD	EXPERT	12+	86.69	A	882	12.7	180.0	246.0
144	セカイはまだ始まってすらいない	niconico＆VOCALOID™	FESTiVAL	DX	EXPERT	8+	88.27	A	1134	8.7	138.0	3383.0
386	ViRTUS	maimai	FESTiVAL	DX	EXPERT	13	88.97	A	2139	13.2	225.0	3886.0
449	はんぶんこ	POPS＆ANIME	FESTiVAL PLUS	DX	MASTER	12+	89.61	A	1131	12.9	124.0	4027.0
446	INTERNET OVERDOSE	POPS＆ANIME	FESTiVAL	DX	MASTER	13	89.85	A	2466	13.5	163.0	3907.0
218	ロストワンの号哭	niconico＆VOCALOID™	ORANGE PLUS	STD	EXPERT	10+	89.98	A	1635	10.8	162.0	981.0

	songId	type	difficulty	noteCounts.tap	noteCounts.hold	noteCounts.slide	noteCounts.touch	noteCounts.break	noteCounts.total	noteDesigner
0	"411Ψ892"	dx	basic	179	15	4	21.0	8	227	-
1	"411Ψ892"	dx	advanced	298	31	12	31.0	10	382	-
2	"411Ψ892"	dx	expert	416	53	52	25.0	85	631	ロシェ@ペンギン
3	"411Ψ892"	dx	master	664	53	86	39.0	103	945	サファ太 vs じゃこレモン
4	#狂った民族２ PRAVARGYAZOOQA	dx	basic	172	10	10	8.0	44	244	-

	Name	Version	Difficulty	Level	Achievement	Play Count	Last Played
0	躯樹の墓守	DX	basic	6	94.2324%	1	2025/01/16 00:25
1	系ぎて	DX	basic	7	96.6252%	1	2024/03/29 15:57
2	系ぎて	DX	advanced	10	94.0313%	1	2025/05/27 17:51
3	廻廻奇譚	DX	advanced	7	97.6466%	1	2024/07/03 15:38
4	廻廻奇譚	DX	expert	9+	93.9536%	1	2025/04/12 00:19

	songId	Genre	Version	type	difficulty	Level	Achv	Rank	DX Score	Chart Constant	bpm	releaseDate	Name	Play Count	Last Played	Last Played_Year	Last Played_Month	Last Played_Day	noteCounts.tap	noteCounts.hold	noteCounts.slide	noteCounts.touch	noteCounts.break	noteCounts.total	noteDesigner	songId_length	songId_Japanese
0	躯樹の墓守	maimai	18.0	1	basic	6	94.23	AAA	756	6.0	211.0	3501.0	躯樹の墓守	1.0	1.736984e+18	2025.0	1.0	15.0	214.0	16.0	6.0	10.0	6.0	252.0	-	5	5
1	系ぎて	maimai	22.0	1	basic	7	96.63	AAA	1032	7.0	88.0	4250.0	系ぎて	1.0	1.711724e+18	2024.0	3.0	29.0	282.0	26.0	8.0	14.0	14.0	344.0	-	3	3
2	廻廻奇譚	POPS＆ANIME	18.0	1	advanced	7	97.65	S	1122	7.5	185.0	3354.0	廻廻奇譚	1.0	1.720017e+18	2024.0	7.0	3.0	301.0	26.0	19.0	22.0	6.0	374.0	-	4	4

	songId	Genre	Version	type	difficulty	Level	Achv	Rank	DX Score	Chart Constant	bpm	releaseDate	Name	Play Count	Last Played	Last Played_Year	Last Played_Month	Last Played_Day	noteCounts.tap	noteCounts.hold	noteCounts.slide	noteCounts.touch	noteCounts.break	noteCounts.total	noteDesigner	songId_length	songId_English	songId_Japanese	songId_Punctuation
count	536	536	521.000000	536.000000	536.000000	536	536.000000	536	536.000000	536.000000	526.000000	522.000000	383	383.000000	3.830000e+02	383.000000	383.000000	383.000000	536.000000	536.000000	536.000000	536.000000	536.000000	536.000000	527	536.000000	536.000000	536.000000	536.000000
unique	440	6	NaN	NaN	NaN	15	NaN	9	NaN	NaN	NaN	NaN	319	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	45	NaN	NaN	NaN	NaN
top	INTERNET OVERDOSE	niconico＆VOCALOID™	NaN	NaN	NaN	12	NaN	S+	NaN	NaN	NaN	NaN	INTERNET OVERDOSE	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	はっぴー	NaN	NaN	NaN	NaN
freq	3	146	NaN	NaN	NaN	78	NaN	92	NaN	NaN	NaN	NaN	3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	79	NaN	NaN	NaN	NaN
mean	NaN	NaN	17.366603	0.720149	2.055970	NaN	98.289291	NaN	1598.423507	10.935448	174.192015	3219.281609	NaN	2.561358	1.757252e+18	2025.208877	6.156658	17.389034	376.194030	51.875000	44.673507	22.535448	27.848881	532.807836	NaN	10.867537	5.227612	4.604478	0.514925
std	NaN	NaN	6.431739	0.449345	0.580559	NaN	2.513036	NaN	390.847516	1.465903	33.361900	1288.210058	NaN	2.808135	1.083736e+16	0.478024	3.780842	9.088142	108.806712	26.458312	30.729080	23.948985	22.903485	130.282505	NaN	8.753274	8.251902	4.833711	1.113956
min	NaN	NaN	1.000000	0.000000	0.000000	NaN	86.690000	NaN	426.000000	6.000000	70.000000	0.000000	NaN	1.000000	1.711724e+18	2024.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	142.000000	NaN	0.000000	0.000000	0.000000	0.000000
25%	NaN	NaN	13.000000	0.000000	2.000000	NaN	97.130000	NaN	1329.000000	10.000000	150.000000	2346.000000	NaN	1.000000	1.752878e+18	2025.000000	1.000000	9.000000	311.000000	34.000000	22.000000	0.000000	11.000000	443.000000	NaN	6.000000	0.000000	0.000000	0.000000
50%	NaN	NaN	19.000000	1.000000	2.000000	NaN	99.025000	NaN	1590.000000	11.000000	176.000000	3543.000000	NaN	1.000000	1.756314e+18	2025.000000	7.000000	18.000000	378.000000	49.000000	40.000000	17.000000	22.000000	530.000000	NaN	9.000000	0.000000	4.000000	0.000000
75%	NaN	NaN	23.000000	1.000000	2.000000	NaN	100.185000	NaN	1878.000000	12.000000	190.000000	4271.000000	NaN	3.000000	1.767130e+18	2025.000000	8.000000	26.000000	450.250000	69.000000	59.250000	34.250000	37.000000	626.000000	NaN	14.000000	10.000000	8.000000	1.000000
max	NaN	NaN	25.000000	1.000000	3.000000	NaN	101.000000	NaN	2958.000000	13.500000	339.000000	4944.000000	NaN	22.000000	1.769882e+18	2026.000000	12.000000	31.000000	726.000000	184.000000	185.000000	153.000000	180.000000	986.000000	NaN	149.000000	120.000000	28.000000	7.000000

	Feature	Coefficient
3	Genre_東方Project	-0.571551
7	type	0.480025
9	Chart Constant	0.320098
6	Version	0.169017
0	Genre_niconico＆VOCALOID™	0.102858
19	noteCounts.slide	0.019177
4	Genre_GAME＆VARIETY	0.003423

	Feature	Importance	Splits	Label
7	Last Played	0.340450	175	0.340 (175)
3	Chart Constant	0.294145	209	0.294 (209)
6	Play Count	0.062840	63	0.063 (63)
13	noteCounts.slide	0.046182	84	0.046 (84)
9	Last Played_Month	0.033795	23	0.034 (23)
12	noteCounts.hold	0.031515	53	0.032 (53)
16	noteCounts.total	0.028551	35	0.029 (35)
0	Version	0.025488	31	0.025 (31)
15	noteCounts.break	0.020023	52	0.020 (52)
4	bpm	0.019168	59	0.019 (59)
5	releaseDate	0.018831	53	0.019 (53)
11	noteCounts.tap	0.015146	26	0.015 (26)
19	songId_Japanese	0.013148	38	0.013 (38)
2	difficulty	0.012306	15	0.012 (15)
14	noteCounts.touch	0.009308	33	0.009 (33)
18	songId_English	0.006773	12	0.007 (12)
20	songId_Punctuation	0.006287	10	0.006 (10)
10	Last Played_Day	0.005070	21	0.005 (21)
23	Genre_POPS＆ANIME	0.004546	16	0.005 (16)
17	songId_length	0.003153	10	0.003 (10)
21	Genre_niconico＆VOCALOID™	0.002195	8	0.002 (8)
25	Genre_GAME＆VARIETY	0.000928	2	0.001 (2)
22	Genre_maimai	0.000154	1	0.000 (1)

	Actual Achv	Predicted Achv (ElasticNet)	Predicted Achv (RandomForest)
0	98.36	98.032286	96.445536
1	100.30	98.757396	99.157519
2	98.59	98.408814	98.791806
3	98.19	98.552006	98.951504
4	100.57	99.991425	99.726866
...	...	...	...
175	100.71	99.864643	99.722681
176	94.54	96.308191	96.253539
177	96.17	97.680423	97.167720
178	95.30	96.507082	96.282976
179	94.37	96.097244	96.287665

Predicting maimai DX player map achievement scores¶