Predicting maimai DX player map achievement scores¶

In [1]:
RANDOM_STATE = 42
In [2]:
%matplotlib inline
from pathlib import Path

import numpy as np
import pandas as pd
import rich
import scipy
import scipy.stats as stats
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import TimeSeriesSplit
In [3]:
# Set a global aesthetic theme
sns.set_theme(style="whitegrid", context="notebook", palette="muted")
plt.rcParams["figure.figsize"] = (12, 6)
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["axes.labelsize"] = 12
pd.set_option("display.max_columns", None)
In [4]:
# Loading the data
maimai_path = Path("maimai.csv")
bpm_path = Path("bpm.csv")

maimai_df = pd.read_csv(maimai_path)
bpm_df = pd.read_csv(bpm_path)
In [5]:
maimai_df
Out[5]:
Song Genre Version Chart Difficulty Level Achv Rank FC/AP Sync DX ✦ DX % DX Score Chart Constant
0 躯樹の墓守 maimai UNiVERSE DX BASIC 6 94.23% AAA FC SYNC 0 75.40% 570/756 6
1 系ぎて maimai BUDDiES DX BASIC 7 96.63% AAA - SYNC 0 74.20% 766/1032 7
2 廻廻奇譚 POPS&ANIME UNiVERSE DX ADVANCED 7 97.65% S - SYNC 0 79.00% 886/1122 7.5
3 INTERNET YAMERO POPS&ANIME BUDDiES PLUS DX ADVANCED 7+ 96.34% AAA - SYNC 0 73.20% 854/1167 7.8
4 INTERNET OVERDOSE POPS&ANIME FESTiVAL DX ADVANCED 7 96.11% AAA - SYNC 0 76.80% 820/1068 7.5
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
535 Kiss Me Kiss オンゲキ&CHUNITHM Splash PLUS DX MASTER 12+ 94.07% AAA - - 0 75.80% 1635/2157 12.8
536 Session High⤴ オンゲキ&CHUNITHM MiLK PLUS STD MASTER 12+ 94.75% AAA - - 0 76.20% 1226/1608 12.8
537 イロトリドリのメロディ オンゲキ&CHUNITHM FiNALE STD MASTER 12+ 94.80% AAA - - 0 73.40% 1267/1725 12.7
538 Agitation! オンゲキ&CHUNITHM maimaiでらっくす DX MASTER 12 96.04% AAA - - 0 73.20% 1461/1995 12
539 Counselor オンゲキ&CHUNITHM PiNK STD MASTER 12 97.02% S - SYNC 0 81.90% 1769/2160 12.4

540 rows × 14 columns

In [6]:
bpm_df
Out[6]:
songId bpm releaseDate
0 "411Ψ892" 184.0 2023-02-03
1 #狂った民族2 PRAVARGYAZOOQA 170.0 2022-11-18
2 +♂ 180.0 2015-12-09
3 -OutsideR:RequieM- 181.0 2019-03-08
4 1000年生きてる 100.0 2023-09-14
... ... ... ...
1383 ヤミナベ!!!! 264.0 2025-12-24
1384 勝手に生きましょ 176.0 2026-01-09
1385 化けの花 180.0 2026-01-09
1386 胡蝶乃舞 180.0 2026-01-23
1387 零號車輛 240.0 2026-01-23

1388 rows × 3 columns

In [7]:
# Merging datasets
maimai_df.rename(columns={"Song": "songId"}, inplace=True)
merged_df = maimai_df.merge(bpm_df, on="songId", how="left")
print(f"Duplicated: {merged_df.duplicated().sum()}")
merged_df.head()
Duplicated: 0
Out[7]:
songId Genre Version Chart Difficulty Level Achv Rank FC/AP Sync DX ✦ DX % DX Score Chart Constant bpm releaseDate
0 躯樹の墓守 maimai UNiVERSE DX BASIC 6 94.23% AAA FC SYNC 0 75.40% 570/756 6 211.0 2022-02-10
1 系ぎて maimai BUDDiES DX BASIC 7 96.63% AAA - SYNC 0 74.20% 766/1032 7 88.0 2024-02-29
2 廻廻奇譚 POPS&ANIME UNiVERSE DX ADVANCED 7 97.65% S - SYNC 0 79.00% 886/1122 7.5 185.0 2021-09-16
3 INTERNET YAMERO POPS&ANIME BUDDiES PLUS DX ADVANCED 7+ 96.34% AAA - SYNC 0 73.20% 854/1167 7.8 185.0 2024-03-21
4 INTERNET OVERDOSE POPS&ANIME FESTiVAL DX ADVANCED 7 96.11% AAA - SYNC 0 76.80% 820/1068 7.5 163.0 2023-03-23
In [8]:
# Clean up unnecessary columns
merged_df.drop(columns=["FC/AP", "Sync", "DX ✦", "DX %"], inplace=True)

# Expose missing values
merged_df["Chart Constant"] = pd.to_numeric(
    merged_df["Chart Constant"], errors="coerce"
)
In [9]:
# convert releaseDate to numerical values
merged_df["releaseDate"] = pd.to_datetime(merged_df["releaseDate"], errors="coerce")
merged_df["releaseDate"] = (
    merged_df["releaseDate"] - merged_df["releaseDate"].min()
).dt.days
merged_df.head(3)
Out[9]:
songId Genre Version Chart Difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate
0 躯樹の墓守 maimai UNiVERSE DX BASIC 6 94.23% AAA 570/756 6.0 211.0 3501.0
1 系ぎて maimai BUDDiES DX BASIC 7 96.63% AAA 766/1032 7.0 88.0 4250.0
2 廻廻奇譚 POPS&ANIME UNiVERSE DX ADVANCED 7 97.65% S 886/1122 7.5 185.0 3354.0
In [10]:
print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
merged_df[merged_df.isna().any(axis=1)]
Total rows with missing values: 15
Rows with missing values:
Out[10]:
songId Genre Version Chart Difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate
5 さくゆいたいそう POPS&ANIME BUDDiES DX ADVANCED 6+ 99.26% SS 829/1005 6.8 NaN NaN
78 前前前世 POPS&ANIME MURASAKi PLUS STD EXPERT 9 97.21% S 1111/1395 NaN 190.0 1807.0
95 ヒトガタ POPS&ANIME UNiVERSE PLUS DX EXPERT 10+ 99.78% SS+ 1463/1650 10.8 NaN NaN
100 さくゆいたいそう POPS&ANIME BUDDiES DX EXPERT 9+ 99.46% SS 1422/1614 9.7 NaN NaN
129 メズマライザー niconico&VOCALOID™ PRiSM PLUS (beta) DX EXPERT 10 100.65% SSS+ 1425/1572 10.5 NaN NaN
130 サイエンス niconico&VOCALOID™ CiRCLE (experimental) DX EXPERT 10 100.78% SSS+ 1468/1596 10.0 188.0 NaN
196 ラグトレイン niconico&VOCALOID™ UNiVERSE DX EXPERT 9+ 96.85% AAA 1280/1608 9.8 NaN NaN
224 有頂天ドリーマーズ 東方Project CiRCLE (experimental) DX EXPERT 10+ 100.53% SSS+ 1182/1338 10.8 188.0 NaN
233 Ultimate taste 東方Project PRiSM (beta) DX EXPERT 11 99.97% SS+ 1547/1785 11.0 NaN NaN
367 411Ψ892 maimai FESTiVAL DX EXPERT 12+ 99.08% SS 1568/1893 12.7 NaN NaN
448 さくゆいたいそう POPS&ANIME BUDDiES DX MASTER 12+ 99.25% SS 1763/2043 12.6 NaN NaN
467 メズマライザー niconico&VOCALOID™ PRiSM PLUS (beta) DX MASTER 11+ 100.08% SSS 1850/2025 11.9 NaN NaN
468 サイエンス niconico&VOCALOID™ CiRCLE (experimental) DX MASTER 13 98.35% S+ 1875/2250 13.0 188.0 NaN
495 エイリアンエイリアン niconico&VOCALOID™ MURASAKi PLUS STD MASTER 12+ 96.18% AAA 1775/2268 12.8 NaN NaN
508 有頂天ドリーマーズ 東方Project CiRCLE (experimental) DX MASTER 13 92.22% AA 1415/1935 13.1 188.0 NaN
In [11]:
# Fill missing Chart Constant values based on Level
level_mapping = merged_df.groupby("Level")["Chart Constant"].median()
print(level_mapping)

merged_df.fillna(
    {"Chart Constant": merged_df["Level"].map(level_mapping)}, inplace=True
)

print(
    f"\nRemaining missing Chart Constant values: {merged_df['Chart Constant'].isna().sum()}"
)
merged_df[["Level", "Chart Constant"]].head(10)
Level
10     10.20
10+    10.70
11     11.20
11+    11.80
12     12.35
12+    12.80
13     13.00
6       6.00
6+      6.80
7       7.20
7+      7.80
8       8.20
8+      8.70
9       9.30
9+      9.80
Name: Chart Constant, dtype: float64

Remaining missing Chart Constant values: 0
Out[11]:
Level Chart Constant
0 6 6.0
1 7 7.0
2 7 7.5
3 7+ 7.8
4 7 7.5
5 6+ 6.8
6 7+ 7.7
7 7 7.5
8 7 7.0
9 7+ 7.9
In [12]:
# Simply map Level "6" -> 6.2, "6+" -> 6.8 for remaining missing values


def map_level_to_constant(level):
    if isinstance(level, str) and level.endswith("+"):
        return float(level[:-1]) + 0.8
    else:
        try:
            return float(level) + 0.2
        except ValueError:
            return None


merged_df["Chart Constant"] = merged_df.apply(
    lambda row: (
        map_level_to_constant(row["Level"])
        if pd.isna(row["Chart Constant"])
        else row["Chart Constant"]
    ),
    axis=1,
)

print(
    f"\nRemaining missing Chart Constant values: {merged_df['Chart Constant'].isna().sum()}"
)
merged_df[["Level", "Chart Constant"]].head(10)
Remaining missing Chart Constant values: 0
Out[12]:
Level Chart Constant
0 6 6.0
1 7 7.0
2 7 7.5
3 7+ 7.8
4 7 7.5
5 6+ 6.8
6 7+ 7.7
7 7 7.5
8 7 7.0
9 7+ 7.9
In [13]:
# Drop easy charts
# merged_df = merged_df[merged_df["Chart Constant"] >= 10.5]
In [14]:
merged_df["DX Score"] = (
    merged_df["DX Score"].astype(str).str.split("/").str[1].astype(int)
)
merged_df.head(3)
Out[14]:
songId Genre Version Chart Difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate
0 躯樹の墓守 maimai UNiVERSE DX BASIC 6 94.23% AAA 756 6.0 211.0 3501.0
1 系ぎて maimai BUDDiES DX BASIC 7 96.63% AAA 1032 7.0 88.0 4250.0
2 廻廻奇譚 POPS&ANIME UNiVERSE DX ADVANCED 7 97.65% S 1122 7.5 185.0 3354.0
In [15]:
# Remove the % suffix from the "Achv" column and convert it to float
merged_df["Achv"] = merged_df["Achv"].str.rstrip("%").astype(float)
merged_df.head(5)
Out[15]:
songId Genre Version Chart Difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate
0 躯樹の墓守 maimai UNiVERSE DX BASIC 6 94.23 AAA 756 6.0 211.0 3501.0
1 系ぎて maimai BUDDiES DX BASIC 7 96.63 AAA 1032 7.0 88.0 4250.0
2 廻廻奇譚 POPS&ANIME UNiVERSE DX ADVANCED 7 97.65 S 1122 7.5 185.0 3354.0
3 INTERNET YAMERO POPS&ANIME BUDDiES PLUS DX ADVANCED 7+ 96.34 AAA 1167 7.8 185.0 4271.0
4 INTERNET OVERDOSE POPS&ANIME FESTiVAL DX ADVANCED 7 96.11 AAA 1068 7.5 163.0 3907.0
In [16]:
# display lowest Achv
merged_df.nsmallest(10, "Achv")
Out[16]:
songId Genre Version Chart Difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate
186 グリーンライツ・セレナーデ niconico&VOCALOID™ maimaiでらっくす DX EXPERT 10 66.53 B 1839 10.3 200.0 2598.0
141 バグ niconico&VOCALOID™ BUDDiES DX EXPERT 10 69.89 B 1854 10.2 186.0 4082.0
342 炎歌 -ほむらうた- maimai BUDDiES PLUS DX EXPERT 9+ 81.31 A 1017 9.8 108.0 286.0
41 FEEL the BEATS maimai PiNK STD ADVANCED 8+ 83.66 A 1485 8.6 166.0 1380.0
383 Fragrance maimai maimai PLUS STD EXPERT 12+ 86.69 A 882 12.7 180.0 246.0
144 セカイはまだ始まってすらいない niconico&VOCALOID™ FESTiVAL DX EXPERT 8+ 88.27 A 1134 8.7 138.0 3383.0
386 ViRTUS maimai FESTiVAL DX EXPERT 13 88.97 A 2139 13.2 225.0 3886.0
449 はんぶんこ POPS&ANIME FESTiVAL PLUS DX MASTER 12+ 89.61 A 1131 12.9 124.0 4027.0
446 INTERNET OVERDOSE POPS&ANIME FESTiVAL DX MASTER 13 89.85 A 2466 13.5 163.0 3907.0
218 ロストワンの号哭 niconico&VOCALOID™ ORANGE PLUS STD EXPERT 10+ 89.98 A 1635 10.8 162.0 981.0
In [17]:
# drop Achv outliners
merged_df = merged_df[(merged_df["Achv"] >= 85)]
In [18]:
# song map metrics
songs_df = Path("songs.csv")
songs_df = pd.read_csv(songs_df)
songs_df.head()
Out[18]:
songId type difficulty noteCounts.tap noteCounts.hold noteCounts.slide noteCounts.touch noteCounts.break noteCounts.total noteDesigner
0 "411Ψ892" dx basic 179 15 4 21.0 8 227 -
1 "411Ψ892" dx advanced 298 31 12 31.0 10 382 -
2 "411Ψ892" dx expert 416 53 52 25.0 85 631 ロシェ@ペンギン
3 "411Ψ892" dx master 664 53 86 39.0 103 945 サファ太 vs じゃこレモン
4 #狂った民族2 PRAVARGYAZOOQA dx basic 172 10 10 8.0 44 244 -
In [19]:
# player-map relation metrics
playcount_df = Path("playcount.csv")
playcount_df = pd.read_csv(playcount_df)
playcount_df.head()
Out[19]:
Name Version Difficulty Level Achievement Play Count Last Played
0 躯樹の墓守 DX basic 6 94.2324% 1 2025/01/16 00:25
1 系ぎて DX basic 7 96.6252% 1 2024/03/29 15:57
2 系ぎて DX advanced 10 94.0313% 1 2025/05/27 17:51
3 廻廻奇譚 DX advanced 7 97.6466% 1 2024/07/03 15:38
4 廻廻奇譚 DX expert 9+ 93.9536% 1 2025/04/12 00:19
In [20]:
# Shift last played by 1 hour early due to Japan timezone
playcount_df["Last Played"] = pd.to_datetime(playcount_df["Last Played"]) - pd.Timedelta(hours=1)
# Feature engineer Year/Month/Daytime from Last Played
playcount_df["Last Played_Year"] = pd.to_datetime(playcount_df["Last Played"]).dt.year
playcount_df["Last Played_Month"] = pd.to_datetime(playcount_df["Last Played"]).dt.month
playcount_df["Last Played_Day"] = pd.to_datetime(playcount_df["Last Played"]).dt.day
# playcount_df["Last Played_Daytime"] = pd.to_datetime(playcount_df["Last Played"]).dt.hour
# playcount_df["Last Played_Daytime"] = pd.cut(
#     playcount_df["Last Played_Daytime"],
#     bins=[-1, 12, 18, 24],
#     labels=["Morning", "Afternoon", "Night"],
#     right=True,
# )
playcount_df["Last Played"] = pd.to_numeric(playcount_df["Last Played"])
playcount_df.head()
Out[20]:
Name Version Difficulty Level Achievement Play Count Last Played Last Played_Year Last Played_Month Last Played_Day
0 躯樹の墓守 DX basic 6 94.2324% 1 1736983500000000000 2025 1 15
1 系ぎて DX basic 7 96.6252% 1 1711724220000000000 2024 3 29
2 系ぎて DX advanced 10 94.0313% 1 1748364660000000000 2025 5 27
3 廻廻奇譚 DX advanced 7 97.6466% 1 1720017480000000000 2024 7 3
4 廻廻奇譚 DX expert 9+ 93.9536% 1 1744413540000000000 2025 4 11
In [21]:
# Match columns with merged_df
playcount_df["Difficulty"] = playcount_df["Difficulty"].str.upper()
playcount_df.rename(columns={"Version": "Chart"}, inplace=True)
# Drop redundancy columns
playcount_df.drop(columns=["Achievement", "Level"], inplace=True)
In [22]:
# Merge w.r.t. Song name & Version (DX/STD) & Difficulty (basic/advanced/expert/master)
merged_df = merged_df.merge(
    playcount_df,
    left_on=["songId", "Chart", "Difficulty"],
    right_on=["Name", "Chart", "Difficulty"],
    how="left",
)
merged_df.head()
Out[22]:
songId Genre Version Chart Difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate Name Play Count Last Played Last Played_Year Last Played_Month Last Played_Day
0 躯樹の墓守 maimai UNiVERSE DX BASIC 6 94.23 AAA 756 6.0 211.0 3501.0 躯樹の墓守 1.0 1.736984e+18 2025.0 1.0 15.0
1 系ぎて maimai BUDDiES DX BASIC 7 96.63 AAA 1032 7.0 88.0 4250.0 系ぎて 1.0 1.711724e+18 2024.0 3.0 29.0
2 廻廻奇譚 POPS&ANIME UNiVERSE DX ADVANCED 7 97.65 S 1122 7.5 185.0 3354.0 廻廻奇譚 1.0 1.720017e+18 2024.0 7.0 3.0
3 INTERNET YAMERO POPS&ANIME BUDDiES PLUS DX ADVANCED 7+ 96.34 AAA 1167 7.8 185.0 4271.0 INTERNET YAMERO 1.0 1.724088e+18 2024.0 8.0 19.0
4 INTERNET OVERDOSE POPS&ANIME FESTiVAL DX ADVANCED 7 96.11 AAA 1068 7.5 163.0 3907.0 INTERNET OVERDOSE 1.0 1.736984e+18 2025.0 1.0 15.0
In [23]:
songs_df["songId"] = songs_df["songId"].str.replace('"', "")

merged_df.rename(columns={"Chart": "type", "Difficulty": "difficulty"}, inplace=True)
merged_df["type"] = merged_df["type"].str.lower()
merged_df["difficulty"] = merged_df["difficulty"].str.lower()

merged_df = merged_df.merge(
    songs_df, on=["songId", "type", "difficulty"], how="left", copy=False
)
In [24]:
# Check new dataset for missing values
print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
merged_df[merged_df.isna().any(axis=1)]
Total rows with missing values: 165
Rows with missing values:
Out[24]:
songId Genre Version type difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate Name Play Count Last Played Last Played_Year Last Played_Month Last Played_Day noteCounts.tap noteCounts.hold noteCounts.slide noteCounts.touch noteCounts.break noteCounts.total noteDesigner
5 さくゆいたいそう POPS&ANIME BUDDiES dx advanced 6+ 99.26 SS 1005 6.8 NaN NaN さくゆいたいそう 1.0 1.740179e+18 2025.0 2.0 21.0 NaN NaN NaN NaN NaN NaN NaN
8 ミラクル・ショッピング POPS&ANIME MiLK std advanced 7 98.15 S+ 1023 7.0 167.0 2050.0 NaN NaN NaN NaN NaN NaN 272.0 20.0 33.0 NaN 16.0 341.0 -
12 ヒバナ niconico&VOCALOID™ FiNALE std advanced 8 98.13 S+ 1215 8.2 200.0 2346.0 NaN NaN NaN NaN NaN NaN 304.0 31.0 39.0 NaN 31.0 405.0 -
16 ロキ niconico&VOCALOID™ FiNALE std advanced 8 98.97 S+ 1101 8.0 150.0 2346.0 NaN NaN NaN NaN NaN NaN 313.0 22.0 10.0 NaN 22.0 367.0 -
17 パーフェクト生命 niconico&VOCALOID™ PiNK PLUS std advanced 8 100.21 SSS 1389 8.1 200.0 1464.0 NaN NaN NaN NaN NaN NaN 424.0 24.0 5.0 NaN 10.0 463.0 -
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
525 Sweets×Sweets maimai maimai std master 11 96.53 AAA 1161 11.4 132.0 0.0 NaN NaN NaN NaN NaN NaN 343.0 23.0 12.0 NaN 9.0 387.0 maimai TEAM
526 ネコ日和。 maimai maimai std master 12+ 98.72 S+ 1038 12.6 171.0 0.0 NaN NaN NaN NaN NaN NaN 283.0 36.0 11.0 NaN 16.0 346.0 ニャイン
532 Session High⤴ オンゲキ&CHUNITHM MiLK PLUS std master 12+ 94.75 AAA 1608 12.8 200.0 2171.0 NaN NaN NaN NaN NaN NaN 390.0 20.0 79.0 NaN 47.0 536.0 すきやき奉行
533 イロトリドリのメロディ オンゲキ&CHUNITHM FiNALE std master 12+ 94.80 AAA 1725 12.7 140.0 2431.0 NaN NaN NaN NaN NaN NaN 483.0 27.0 42.0 NaN 23.0 575.0 華火職人
535 Counselor オンゲキ&CHUNITHM PiNK std master 12 97.02 S 2160 12.4 135.0 1246.0 NaN NaN NaN NaN NaN NaN 526.0 69.0 88.0 NaN 37.0 720.0 Techno Kitchen

165 rows × 25 columns

In [25]:
note_count_columns = [
    "noteCounts.tap",
    "noteCounts.hold",
    "noteCounts.slide",
    "noteCounts.touch",
    "noteCounts.break",
]

for col in note_count_columns:
    if col in merged_df.columns:
        merged_df[col] = pd.to_numeric(merged_df[col], errors="coerce")

merged_df[note_count_columns] = merged_df[note_count_columns].fillna(0)

total_notes = merged_df[note_count_columns].sum(axis=1).replace(0, 1)
merged_df["noteCounts.total"] = merged_df["DX Score"] / 3

merged_df[note_count_columns] = (
    merged_df[note_count_columns]
    .multiply(merged_df["noteCounts.total"] / total_notes, axis=0)
    .round()
)

print(
    f"\nRemaining missing noteCounts values:\n{merged_df[note_count_columns].isna().sum()}"
)
merged_df[merged_df[note_count_columns].isna().any(axis=1)]
Remaining missing noteCounts values:
noteCounts.tap      0
noteCounts.hold     0
noteCounts.slide    0
noteCounts.touch    0
noteCounts.break    0
dtype: int64
Out[25]:
songId Genre Version type difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate Name Play Count Last Played Last Played_Year Last Played_Month Last Played_Day noteCounts.tap noteCounts.hold noteCounts.slide noteCounts.touch noteCounts.break noteCounts.total noteDesigner
In [26]:
# Check new dataset for missing values
print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
# Hide cols with no missing values
merged_df[merged_df.isna().any(axis=1)].head()
Total rows with missing values: 165
Rows with missing values:
Out[26]:
songId Genre Version type difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate Name Play Count Last Played Last Played_Year Last Played_Month Last Played_Day noteCounts.tap noteCounts.hold noteCounts.slide noteCounts.touch noteCounts.break noteCounts.total noteDesigner
5 さくゆいたいそう POPS&ANIME BUDDiES dx advanced 6+ 99.26 SS 1005 6.8 NaN NaN さくゆいたいそう 1.0 1.740179e+18 2025.0 2.0 21.0 0.0 0.0 0.0 0.0 0.0 335.0 NaN
8 ミラクル・ショッピング POPS&ANIME MiLK std advanced 7 98.15 S+ 1023 7.0 167.0 2050.0 NaN NaN NaN NaN NaN NaN 272.0 20.0 33.0 0.0 16.0 341.0 -
12 ヒバナ niconico&VOCALOID™ FiNALE std advanced 8 98.13 S+ 1215 8.2 200.0 2346.0 NaN NaN NaN NaN NaN NaN 304.0 31.0 39.0 0.0 31.0 405.0 -
16 ロキ niconico&VOCALOID™ FiNALE std advanced 8 98.97 S+ 1101 8.0 150.0 2346.0 NaN NaN NaN NaN NaN NaN 313.0 22.0 10.0 0.0 22.0 367.0 -
17 パーフェクト生命 niconico&VOCALOID™ PiNK PLUS std advanced 8 100.21 SSS 1389 8.1 200.0 1464.0 NaN NaN NaN NaN NaN NaN 424.0 24.0 5.0 0.0 10.0 463.0 -
In [27]:
# SongId specific feature engineering
# * Length of songId
# * language classifier (English/Japanese)
# * HasPunctuation
merged_df["songId_length"] = merged_df["songId"].str.strip().str.len()
merged_df["songId_English"] = (
    merged_df["songId"].str.count(r"[A-Za-z0-9]").fillna(0).astype(int)
)
merged_df["songId_Japanese"] = (
    merged_df["songId"]
    .str.count(r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\uFF65-\uFF9F]")
    .fillna(0)
    .astype(int)
)
merged_df["songId_Punctuation"] = (
    merged_df["songId"].str.count(r"[^\w\s]").fillna(0).astype(int)
)
merged_df.columns
Out[27]:
Index(['songId', 'Genre', 'Version', 'type', 'difficulty', 'Level', 'Achv',
       'Rank', 'DX Score', 'Chart Constant', 'bpm', 'releaseDate', 'Name',
       'Play Count', 'Last Played', 'Last Played_Year', 'Last Played_Month',
       'Last Played_Day', 'noteCounts.tap', 'noteCounts.hold',
       'noteCounts.slide', 'noteCounts.touch', 'noteCounts.break',
       'noteCounts.total', 'noteDesigner', 'songId_length', 'songId_English',
       'songId_Japanese', 'songId_Punctuation'],
      dtype='object')
In [28]:
# Unique versions
print(merged_df["Version"].unique())
# Sort versions properly
versions = [
    "maimai",
    "maimai PLUS",
    "GreeN",
    "GreeN PLUS",
    "ORANGE",
    "ORANGE PLUS",
    "PiNK",
    "PiNK PLUS",
    "MURASAKi",
    "MURASAKi PLUS",
    "MiLK",
    "MiLK PLUS",
    "FiNALE",
    "maimaiでらっくす",
    "maimaiでらっくす PLUS",
    "Splash",
    "Splash PLUS",
    "UNiVERSE",
    "UNiVERSE PLUS",
    "FESTiVAL",
    "FESTiVAL PLUS",
    "BUDDiES",
    "BUDDiES PLUS",
    "PRiSM (beta)",
    "PRiSM PLUS (beta)",
    "CiRCLE",
    # TODO: Don't hardcode this. Also, this is not needed for tree-based models.
    # We use this only for linear models. If we remove Ridge, we can cut this liability.
]
version_order = {version: i + 1 for i, version in enumerate(versions)}

merged_df["Version"] = merged_df["Version"].map(version_order)
merged_df["Version"]
['UNiVERSE' 'BUDDiES' 'BUDDiES PLUS' 'FESTiVAL' 'MiLK' 'FiNALE'
 'maimaiでらっくす' 'PiNK PLUS' 'MURASAKi' 'MURASAKi PLUS' 'PRiSM (beta)'
 'ORANGE' 'MiLK PLUS' 'UNiVERSE PLUS' 'FESTiVAL PLUS' 'PiNK' 'ORANGE PLUS'
 'maimai PLUS' 'PRiSM PLUS (beta)' 'maimaiでらっくす PLUS' 'Splash'
 'Splash PLUS' 'maimai' 'CiRCLE (experimental)' 'GreeN' 'GreeN PLUS']
Out[28]:
0      18.0
1      22.0
2      18.0
3      23.0
4      20.0
       ... 
531    17.0
532    12.0
533    13.0
534    14.0
535     7.0
Name: Version, Length: 536, dtype: float64
In [29]:
# Convert "type" into simple numeric categories
type_mapping = {"std": 0, "dx": 1}
merged_df["type"] = merged_df["type"].map(type_mapping)
merged_df.head(3)
Out[29]:
songId Genre Version type difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate Name Play Count Last Played Last Played_Year Last Played_Month Last Played_Day noteCounts.tap noteCounts.hold noteCounts.slide noteCounts.touch noteCounts.break noteCounts.total noteDesigner songId_length songId_English songId_Japanese songId_Punctuation
0 躯樹の墓守 maimai 18.0 1 basic 6 94.23 AAA 756 6.0 211.0 3501.0 躯樹の墓守 1.0 1.736984e+18 2025.0 1.0 15.0 214.0 16.0 6.0 10.0 6.0 252.0 - 5 0 5 0
1 系ぎて maimai 22.0 1 basic 7 96.63 AAA 1032 7.0 88.0 4250.0 系ぎて 1.0 1.711724e+18 2024.0 3.0 29.0 282.0 26.0 8.0 14.0 14.0 344.0 - 3 0 3 0
2 廻廻奇譚 POPS&ANIME 18.0 1 advanced 7 97.65 S 1122 7.5 185.0 3354.0 廻廻奇譚 1.0 1.720017e+18 2024.0 7.0 3.0 301.0 26.0 19.0 22.0 6.0 374.0 - 4 0 4 0
In [30]:
# Convert "difficulty" into simple numeric categories
difficulty_mapping = {
    "basic": 0,
    "advanced": 1,
    "expert": 2,
    "master": 3,
    "re:master": 4,
}
merged_df["difficulty"] = merged_df["difficulty"].map(difficulty_mapping)
merged_df.head(3)
Out[30]:
songId Genre Version type difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate Name Play Count Last Played Last Played_Year Last Played_Month Last Played_Day noteCounts.tap noteCounts.hold noteCounts.slide noteCounts.touch noteCounts.break noteCounts.total noteDesigner songId_length songId_English songId_Japanese songId_Punctuation
0 躯樹の墓守 maimai 18.0 1 0 6 94.23 AAA 756 6.0 211.0 3501.0 躯樹の墓守 1.0 1.736984e+18 2025.0 1.0 15.0 214.0 16.0 6.0 10.0 6.0 252.0 - 5 0 5 0
1 系ぎて maimai 22.0 1 0 7 96.63 AAA 1032 7.0 88.0 4250.0 系ぎて 1.0 1.711724e+18 2024.0 3.0 29.0 282.0 26.0 8.0 14.0 14.0 344.0 - 3 0 3 0
2 廻廻奇譚 POPS&ANIME 18.0 1 1 7 97.65 S 1122 7.5 185.0 3354.0 廻廻奇譚 1.0 1.720017e+18 2024.0 7.0 3.0 301.0 26.0 19.0 22.0 6.0 374.0 - 4 0 4 0
In [31]:
merged_df.describe(include="all")
Out[31]:
songId Genre Version type difficulty Level Achv Rank DX Score Chart Constant bpm releaseDate Name Play Count Last Played Last Played_Year Last Played_Month Last Played_Day noteCounts.tap noteCounts.hold noteCounts.slide noteCounts.touch noteCounts.break noteCounts.total noteDesigner songId_length songId_English songId_Japanese songId_Punctuation
count 536 536 521.000000 536.000000 536.000000 536 536.000000 536 536.000000 536.000000 526.000000 522.000000 383 383.000000 3.830000e+02 383.000000 383.000000 383.000000 536.000000 536.000000 536.000000 536.000000 536.000000 536.000000 527 536.000000 536.000000 536.000000 536.000000
unique 440 6 NaN NaN NaN 15 NaN 9 NaN NaN NaN NaN 319 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 45 NaN NaN NaN NaN
top INTERNET OVERDOSE niconico&VOCALOID™ NaN NaN NaN 12 NaN S+ NaN NaN NaN NaN INTERNET OVERDOSE NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN はっぴー NaN NaN NaN NaN
freq 3 146 NaN NaN NaN 78 NaN 92 NaN NaN NaN NaN 3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 79 NaN NaN NaN NaN
mean NaN NaN 17.366603 0.720149 2.055970 NaN 98.289291 NaN 1598.423507 10.935448 174.192015 3219.281609 NaN 2.561358 1.757252e+18 2025.208877 6.156658 17.389034 376.194030 51.875000 44.673507 22.535448 27.848881 532.807836 NaN 10.867537 5.227612 4.604478 0.514925
std NaN NaN 6.431739 0.449345 0.580559 NaN 2.513036 NaN 390.847516 1.465903 33.361900 1288.210058 NaN 2.808135 1.083736e+16 0.478024 3.780842 9.088142 108.806712 26.458312 30.729080 23.948985 22.903485 130.282505 NaN 8.753274 8.251902 4.833711 1.113956
min NaN NaN 1.000000 0.000000 0.000000 NaN 86.690000 NaN 426.000000 6.000000 70.000000 0.000000 NaN 1.000000 1.711724e+18 2024.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 142.000000 NaN 0.000000 0.000000 0.000000 0.000000
25% NaN NaN 13.000000 0.000000 2.000000 NaN 97.130000 NaN 1329.000000 10.000000 150.000000 2346.000000 NaN 1.000000 1.752878e+18 2025.000000 1.000000 9.000000 311.000000 34.000000 22.000000 0.000000 11.000000 443.000000 NaN 6.000000 0.000000 0.000000 0.000000
50% NaN NaN 19.000000 1.000000 2.000000 NaN 99.025000 NaN 1590.000000 11.000000 176.000000 3543.000000 NaN 1.000000 1.756314e+18 2025.000000 7.000000 18.000000 378.000000 49.000000 40.000000 17.000000 22.000000 530.000000 NaN 9.000000 0.000000 4.000000 0.000000
75% NaN NaN 23.000000 1.000000 2.000000 NaN 100.185000 NaN 1878.000000 12.000000 190.000000 4271.000000 NaN 3.000000 1.767130e+18 2025.000000 8.000000 26.000000 450.250000 69.000000 59.250000 34.250000 37.000000 626.000000 NaN 14.000000 10.000000 8.000000 1.000000
max NaN NaN 25.000000 1.000000 3.000000 NaN 101.000000 NaN 2958.000000 13.500000 339.000000 4944.000000 NaN 22.000000 1.769882e+18 2026.000000 12.000000 31.000000 726.000000 184.000000 185.000000 153.000000 180.000000 986.000000 NaN 149.000000 120.000000 28.000000 7.000000
In [32]:
# Filter out numerical columns for correlation matrix
correlation_matrix = merged_df.select_dtypes(include=["number"]).corr()

fig = plt.figure(clear=True, figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    square=True,
    cmap="coolwarm",
    center=0,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title("Correlation Matrix", fontsize=16, pad=20)
plt.show()
No description has been provided for this image
In [33]:
def correlation_rankings(
    corr_mat: pd.DataFrame, target_column: str, threshold: float = 0.95
):
    """Print all abs(correlation with anything) that exceeds a threshold"""
    output = []

    for col in corr_mat.columns:
        if col == target_column:
            continue

        strong_correlations = (
            corr_mat[col][corr_mat[col].abs() > threshold]
            .drop(col)
            .sort_values(ascending=False, key=lambda x: x.abs())
        )

        if not strong_correlations.empty:
            formatted_corrs: list = []
            for corr_col, value in strong_correlations.items():
                color: str = "bold green" if value > 0 else "bold red"
                formatted_corrs.append(f"[{color}]{corr_col} ({value:.2f})[/{color}]")

            corr_with_target = corr_mat.loc[col, target_column]
            correlations_str = ", ".join(formatted_corrs)

            header = "".join(
                [
                    f"[bold sky_blue1]{col} ({target_column}: {corr_with_target:.2f}, ",
                    f"count: {len(strong_correlations)}):[/bold sky_blue1]",
                ]
            )
            full_line = f"{header} [{correlations_str}]"

            output.append(full_line)

    output.sort(
        key=lambda x: float(x.split(f"({target_column}: ")[1].split(",")[0]),
        reverse=True,
    )
    if output:
        rich.print("\n".join(output))


correlation_rankings(correlation_matrix, target_column="Achv")
DX Score (Achv: -0.07, count: 1): [noteCounts.total (1.00)]
noteCounts.total (Achv: -0.07, count: 1): [DX Score (1.00)]
In [34]:
# Drop collinear columns
merged_df.drop(
    columns=[
        "DX Score",
    ],
    inplace=True,
)

correlation_matrix = merged_df.select_dtypes(include=["number"]).corr()
correlation_rankings(correlation_matrix, target_column="Achv")

fig = plt.figure(clear=True, figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    square=True,
    cmap="coolwarm",
    center=0,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
)
plt.title("Correlation Matrix (Reduced)", fontsize=16, pad=20)
plt.show()
No description has been provided for this image
In [35]:
# Pairwise correlation with seaborn
sns.pairplot(
    merged_df,
    diag_kind="kde",
    plot_kws={"alpha": 0.4, "s": 15},
    corner=True,
    height=1.7,
)
plt.show()
No description has been provided for this image

Regression

In [36]:
print(merged_df.columns)

numerical_features = (
    merged_df.select_dtypes(include=["number"]).columns.drop("Achv").tolist()
)
categorical_features = ["Genre"]

drop_columns = [
    feature
    for feature in merged_df.columns
    if feature not in numerical_features + categorical_features + ["Achv"]
]
print(f"Dropping columns: {drop_columns}")
merged_df.drop(
    columns=drop_columns,
    inplace=True,
)

merged_df.dropna(inplace=True)
Index(['songId', 'Genre', 'Version', 'type', 'difficulty', 'Level', 'Achv',
       'Rank', 'Chart Constant', 'bpm', 'releaseDate', 'Name', 'Play Count',
       'Last Played', 'Last Played_Year', 'Last Played_Month',
       'Last Played_Day', 'noteCounts.tap', 'noteCounts.hold',
       'noteCounts.slide', 'noteCounts.touch', 'noteCounts.break',
       'noteCounts.total', 'noteDesigner', 'songId_length', 'songId_English',
       'songId_Japanese', 'songId_Punctuation'],
      dtype='object')
Dropping columns: ['songId', 'Level', 'Rank', 'Name', 'noteDesigner']
In [37]:
# merged_df.to_csv("maimai.cleaned.csv", index=False)
In [58]:
merged_df.sort_values(by="Last Played", inplace=True)

testset_size = 0.5
X_train, X_test = merged_df.iloc[
    : int(len(merged_df) * (1 - testset_size)), :
], merged_df.iloc[
    int(len(merged_df) * (1 - testset_size)) :, :
]
y_train, y_test = X_train.pop("Achv"), X_test.pop("Achv")
In [59]:
# Visualize distribution of Achv
plt.figure(figsize=(6, 3))
sns.histplot(y_train, bins=100, kde=True)
plt.title("Distribution of y_train")
plt.xlabel("Achv (%)")
plt.show()

TRANSFORM_ACHV = ["box-cox", "log", "none"][0]

if TRANSFORM_ACHV == "box-cox":
    # Convert non-linear Achv score to a (almost) gaussian like distribution
    y_train, achv_boxcox_lambda = stats.boxcox(y_train)
    y_test = stats.boxcox(y_test, lmbda=achv_boxcox_lambda)
    # The target variable now has crazy large values, so we normalize it
    scaler = StandardScaler()
    y_train = scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
    y_test = scaler.transform(y_test.reshape(-1, 1)).flatten()
    print(f"Achv Box-Cox Lambda: {achv_boxcox_lambda}")
elif TRANSFORM_ACHV == "log":
    y_train = np.log(y_train / (101 - y_train))
    y_test = np.log(y_test / (101 - y_test))
else:
    pass

plt.figure(figsize=(6, 3))
sns.histplot(y_train, bins=100, kde=True)
plt.title(f"Distribution of y_train after {TRANSFORM_ACHV} Transformation")
plt.xlabel("Achv")
plt.ylabel("Frequency")
plt.show()


def Achv_inverse_transform(y_transformed):
    if TRANSFORM_ACHV == "box-cox":
        # Inverse Box-Cox transformation
        y = np.asarray(y_transformed).reshape(-1, 1)
        y = scaler.inverse_transform(y).flatten()
        y = scipy.special.inv_boxcox(y, achv_boxcox_lambda)
    elif TRANSFORM_ACHV == "log":
        y = 101 / (1 + np.exp(-y_transformed))
    else:
        y = y_transformed
    return y
No description has been provided for this image
Achv Box-Cox Lambda: 27.019015925780376
No description has been provided for this image
In [60]:
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(
                steps=[
                    (
                        "poly",
                        PolynomialFeatures(
                            degree=1, include_bias=False, interaction_only=False
                        ),
                    ),
                    ("scaler", StandardScaler()),
                ]
            ),
            numerical_features,
        ),
        (
            "cat",
            OneHotEncoder(
                sparse_output=False,
                handle_unknown="error",
                categories=[
                    merged_df[column].unique().tolist()
                    for column in categorical_features
                ],
            ),
            categorical_features,
        ),
    ]
)
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "regressor",
            ElasticNetCV(
                max_iter=100000,
                l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
                random_state=RANDOM_STATE,
                cv=TimeSeriesSplit(n_splits=5),
            ),
        ),
    ]
)

pipeline.fit(X_train, y_train)
chosen_alpha = pipeline.named_steps["regressor"].alpha_
print("Chosen alpha:", chosen_alpha)
l1_ratio = pipeline.named_steps["regressor"].l1_ratio_
print("Chosen l1_ratio:", l1_ratio)
Chosen alpha: 0.06262986919313132
Chosen l1_ratio: 1.0
In [68]:
y_train_pred = pipeline.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_absolute_error(y_train, y_train_pred)
train_mse_transformed = mean_absolute_error(
    Achv_inverse_transform(y_train), Achv_inverse_transform(y_train_pred)
)
print("Train set performance:")
print(f"R^2 Score: {train_r2:.4f}")
print(f"Mean Absolute Error: {train_mse:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {train_mse_transformed:.4f}")

y_test_pred = pipeline.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_absolute_error(y_test, y_test_pred)
test_mse_transformed = mean_absolute_error(
    Achv_inverse_transform(y_test), Achv_inverse_transform(y_test_pred)
)
print("Test set performance:")
print(f"R^2 Score: {test_r2:.4f}")
print(f"Mean Absolute Error: {test_mse:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {test_mse_transformed:.4f}")

cat_encoder = pipeline.named_steps["preprocessor"].named_transformers_["cat"]
cat_feature_names = cat_encoder.get_feature_names_out(categorical_features)

all_feature_names = (
    list(cat_feature_names)
    + pipeline.named_steps["preprocessor"]
    .named_transformers_["num"]
    .named_steps["poly"]
    .get_feature_names_out(numerical_features)
    .tolist()
)

regressor_step = pipeline.named_steps["regressor"]

results_df = pd.DataFrame(
    {
        "Feature": [all_feature_names[i] for i in range(len(all_feature_names))],
        "Coefficient": regressor_step.coef_
    }
).sort_values(by="Coefficient", ascending=False, key=abs)
results_df = results_df[results_df["Coefficient"] != 0]
display(results_df)
Train set performance:
R^2 Score: 0.5654
Mean Absolute Error: 0.5362
Mean Absolute Error (Inverse Transformed): 1.1676
Test set performance:
R^2 Score: 0.3839
Mean Absolute Error: 0.6909
Mean Absolute Error (Inverse Transformed): 1.3568
Feature Coefficient
3 Genre_東方Project -0.571551
7 type 0.480025
9 Chart Constant 0.320098
6 Version 0.169017
0 Genre_niconico&VOCALOID™ 0.102858
19 noteCounts.slide 0.019177
4 Genre_GAME&VARIETY 0.003423
In [69]:
y_train_inv = Achv_inverse_transform(y_train)
y_train_pred_inv = Achv_inverse_transform(y_train_pred)
y_test_inv = Achv_inverse_transform(y_test)
y_test_pred_inv = Achv_inverse_transform(y_test_pred)

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Common scatter kwargs
scatter_kws = {"alpha": 0.6, "s": 40, "edgecolor": "white", "linewidth": 0.5}
line_kws = {"color": "#333333", "linestyle": "--", "linewidth": 2, "alpha": 0.8}

# Predicted vs Actual for training set
axes[0].scatter(y_train_inv, y_train_pred_inv, **scatter_kws)
axes[0].plot(
    [y_train_inv.min(), y_train_inv.max()],
    [y_train_inv.min(), y_train_inv.max()],
    **line_kws,
)
axes[0].set_xlabel("Actual Achv")
axes[0].set_ylabel("Predicted Achv")
axes[0].set_title("Training Set: Predicted vs Actual")

# Predicted vs Actual for test set
axes[1].scatter(y_test_inv, y_test_pred_inv, color="C1", **scatter_kws)
axes[1].plot(
    [y_test_inv.min(), y_test_inv.max()],
    [y_test_inv.min(), y_test_inv.max()],
    **line_kws,
)
axes[1].set_xlabel("Actual Achv")
axes[1].set_ylabel("Predicted Achv")
axes[1].set_title("Test Set: Predicted vs Actual")

# Residual QQ plot
residuals = y_test - y_test_pred
stats.probplot(residuals, dist="norm", plot=axes[2])
axes[2].get_lines()[0].set_markerfacecolor("C2")
axes[2].get_lines()[0].set_markeredgecolor("white")
axes[2].get_lines()[0].set_alpha(0.6)
axes[2].get_lines()[1].set_color("#333333")
axes[2].get_lines()[1].set_linewidth(2)
axes[2].set_title("QQ Plot of Residuals")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [90]:
preprocessor_rf = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numerical_features),
        (
            "cat",
            OneHotEncoder(
                sparse_output=True,
                handle_unknown="error",
                categories=[
                    merged_df[column].unique().tolist()
                    for column in categorical_features
                ],
            ),
            categorical_features,
        ),
    ]
)
pipeline_rf = Pipeline(
    steps=[
        ("preprocessor", preprocessor_rf),
        (
            "rfr",
            RandomForestRegressor(
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)
param_grid = {
    "rfr__max_depth": np.arange(2, 5, 1),
    "rfr__min_samples_leaf": np.arange(2, 50, 1),
    "rfr__ccp_alpha": np.arange(0, 0.04, 0.001),
}

grid_search = RandomizedSearchCV(
    pipeline_rf,
    param_grid,
    scoring="r2",
    n_jobs=-1,
    verbose=10,
    random_state=RANDOM_STATE,
    n_iter=100,
    cv=TimeSeriesSplit(n_splits=10),
)

grid_search.fit(X_train, y_train)
pipeline_rf = grid_search.best_estimator_
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV R^2: {grid_search.best_score_:.4f}")
Fitting 10 folds for each of 100 candidates, totalling 1000 fits

Best parameters: {'rfr__min_samples_leaf': np.int64(4), 'rfr__max_depth': np.int64(4), 'rfr__ccp_alpha': np.float64(0.003)}
Best CV R^2: -0.1283
In [91]:
y_train_pred_rf = pipeline_rf.predict(X_train)
train_r2_rf = r2_score(y_train, y_train_pred_rf)
train_mse_rf = mean_absolute_error(y_train, y_train_pred_rf)
train_mse_rf_transformed = mean_absolute_error(
    Achv_inverse_transform(y_train), Achv_inverse_transform(y_train_pred_rf)
)
print("Train set performance:")
print(f"R^2 Score: {train_r2_rf:.4f}")
print(f"Mean Absolute Error: {train_mse_rf:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {train_mse_rf_transformed:.4f}")

y_test_pred_rf = pipeline_rf.predict(X_test)
test_r2_rf = r2_score(y_test, y_test_pred_rf)
test_mse_rf = mean_absolute_error(y_test, y_test_pred_rf)
test_mse_rf_transformed = mean_absolute_error(
    Achv_inverse_transform(y_test), Achv_inverse_transform(y_test_pred_rf)
)
print("Test set performance:")
print(f"R^2 Score: {test_r2_rf:.4f}")
print(f"Mean Absolute Error: {test_mse_rf:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {test_mse_rf_transformed:.4f}")
Train set performance:
R^2 Score: 0.7257
Mean Absolute Error: 0.4245
Mean Absolute Error (Inverse Transformed): 0.9093
Test set performance:
R^2 Score: 0.4535
Mean Absolute Error: 0.7243
Mean Absolute Error (Inverse Transformed): 1.3905
In [92]:
y_train_inv_rf = Achv_inverse_transform(y_train)
y_train_pred_inv_rf = Achv_inverse_transform(y_train_pred_rf)
y_test_inv_rf = Achv_inverse_transform(y_test)
y_test_pred_inv_rf = Achv_inverse_transform(y_test_pred_rf)

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Common scatter kwargs
scatter_kws = {"alpha": 0.6, "s": 40, "edgecolor": "white", "linewidth": 0.5}
line_kws = {"color": "#333333", "linestyle": "--", "linewidth": 2, "alpha": 0.8}

# 1. Predicted vs Actual for training set
axes[0].scatter(y_train_inv_rf, y_train_pred_inv_rf, color="C0", **scatter_kws)
axes[0].plot(
    [y_train_inv_rf.min(), y_train_inv_rf.max()],
    [y_train_inv_rf.min(), y_train_inv_rf.max()],
    **line_kws,
)
axes[0].set_xlabel("Actual Achv")
axes[0].set_ylabel("Predicted Achv")
axes[0].set_title("Training Set: Predicted vs Actual (RF)")

# 2. Predicted vs Actual for test set
axes[1].scatter(y_test_inv_rf, y_test_pred_inv_rf, color="C1", **scatter_kws)
axes[1].plot(
    [y_test_inv_rf.min(), y_test_inv_rf.max()],
    [y_test_inv_rf.min(), y_test_inv_rf.max()],
    **line_kws,
)
axes[1].set_xlabel("Actual Achv")
axes[1].set_ylabel("Predicted Achv")
axes[1].set_title("Test Set: Predicted vs Actual (RF)")

plt.tight_layout()
plt.show()
No description has been provided for this image
In [93]:
rf_features_step = pipeline_rf.named_steps["preprocessor"].named_transformers_["num"]
rf_numeric_feature_names = rf_features_step.get_feature_names_out(numerical_features)

cat_encoder_rf = pipeline_rf.named_steps["preprocessor"].named_transformers_["cat"]
cat_feature_names_rf = cat_encoder_rf.get_feature_names_out(categorical_features)

all_feature_names_rf = list(rf_numeric_feature_names) + list(cat_feature_names_rf)
rf_model = pipeline_rf.named_steps["rfr"]

# Calculate Split Counts (Frequency)
n_features = len(all_feature_names_rf)
split_counts = np.zeros(n_features, dtype=int)

for tree in rf_model.estimators_:
    tree_features = tree.tree_.feature
    # Filter out leaf nodes (indicated by -2)
    indices = tree_features[tree_features >= 0]
    np.add.at(split_counts, indices, 1)

# Create DataFrame with Importance and Splits
importance_df = pd.DataFrame(
    {
        "Feature": all_feature_names_rf,
        "Importance": rf_model.feature_importances_,
        "Splits": split_counts,
    }
).sort_values(by="Importance", ascending=False)

# Filter for visualization
useful_features = importance_df[importance_df["Importance"] > 0].copy()

# Create custom labels: "Feature (Importance | Splits)"
useful_features["Label"] = useful_features.apply(
    lambda x: f"{x['Importance']:.3f} ({int(x['Splits'])})", axis=1
)

display(useful_features)

# Visualization
fig, ax = plt.subplots(figsize=(14, len(useful_features) * 0.4 + 2))

# Create gradient colors based on Importance
norm = plt.Normalize(useful_features["Importance"].min(), useful_features["Importance"].max())
colors = plt.cm.viridis(norm(useful_features["Importance"]))

bars = ax.barh(
    range(len(useful_features)),
    useful_features["Importance"],
    color=colors,
    edgecolor="none",
    height=0.7
)

# Add text labels inside or next to bars
for i, (bar, label) in enumerate(zip(bars, useful_features["Label"])):
    width = bar.get_width()
    ax.text(
        width,
        bar.get_y() + bar.get_height() / 2,
        f" {label}",
        va="center",
        ha="left",
        fontsize=10,
        color="#333333",
        fontweight="medium"
    )

ax.set_yticks(range(len(useful_features)))
ax.set_yticklabels(useful_features["Feature"], fontsize=11)
ax.set_xlabel("Importance (Gain)", fontsize=12)
ax.set_title("Feature Importances (Gain) and Split Counts", fontsize=15, pad=15)
ax.invert_yaxis()

# Remove top and right spines for cleaner look
sns.despine()

plt.tight_layout()
plt.show()
Feature Importance Splits Label
7 Last Played 0.340450 175 0.340 (175)
3 Chart Constant 0.294145 209 0.294 (209)
6 Play Count 0.062840 63 0.063 (63)
13 noteCounts.slide 0.046182 84 0.046 (84)
9 Last Played_Month 0.033795 23 0.034 (23)
12 noteCounts.hold 0.031515 53 0.032 (53)
16 noteCounts.total 0.028551 35 0.029 (35)
0 Version 0.025488 31 0.025 (31)
15 noteCounts.break 0.020023 52 0.020 (52)
4 bpm 0.019168 59 0.019 (59)
5 releaseDate 0.018831 53 0.019 (53)
11 noteCounts.tap 0.015146 26 0.015 (26)
19 songId_Japanese 0.013148 38 0.013 (38)
2 difficulty 0.012306 15 0.012 (15)
14 noteCounts.touch 0.009308 33 0.009 (33)
18 songId_English 0.006773 12 0.007 (12)
20 songId_Punctuation 0.006287 10 0.006 (10)
10 Last Played_Day 0.005070 21 0.005 (21)
23 Genre_POPS&ANIME 0.004546 16 0.005 (16)
17 songId_length 0.003153 10 0.003 (10)
21 Genre_niconico&VOCALOID™ 0.002195 8 0.002 (8)
25 Genre_GAME&VARIETY 0.000928 2 0.001 (2)
22 Genre_maimai 0.000154 1 0.000 (1)
C:\Users\123er\AppData\Local\Temp\ipykernel_7824\3721033785.py:77: UserWarning: Glyph 65286 (\N{FULLWIDTH AMPERSAND}) missing from font(s) Arial.
  plt.tight_layout()
d:\Personal Data\Repositories\personal-repo\maimai-dx-regression\.venv\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 65286 (\N{FULLWIDTH AMPERSAND}) missing from font(s) Arial.
  fig.canvas.print_figure(bytes_io, **kw)
No description has been provided for this image
In [94]:
# Show the test set predictions and actual scores
test_results_df = pd.DataFrame(
    {
        "Actual Achv": Achv_inverse_transform(y_test),
        "Predicted Achv (ElasticNet)": Achv_inverse_transform(y_test_pred),
        "Predicted Achv (RandomForest)": Achv_inverse_transform(y_test_pred_rf),
    }
)
display(test_results_df)
Actual Achv Predicted Achv (ElasticNet) Predicted Achv (RandomForest)
0 98.36 98.032286 96.445536
1 100.30 98.757396 99.157519
2 98.59 98.408814 98.791806
3 98.19 98.552006 98.951504
4 100.57 99.991425 99.726866
... ... ... ...
175 100.71 99.864643 99.722681
176 94.54 96.308191 96.253539
177 96.17 97.680423 97.167720
178 95.30 96.507082 96.282976
179 94.37 96.097244 96.287665

180 rows × 3 columns