Predicting maimai DX player map achievement scores¶
In [1]:
RANDOM_STATE = 42
In [2]:
%matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
import rich
import scipy
import scipy.stats as stats
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.model_selection import TimeSeriesSplit
In [3]:
# Set a global aesthetic theme
sns.set_theme(style="whitegrid", context="notebook", palette="muted")
plt.rcParams["figure.figsize"] = (12, 6)
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["axes.labelsize"] = 12
pd.set_option("display.max_columns", None)
In [4]:
# Loading the data
maimai_path = Path("maimai.csv")
bpm_path = Path("bpm.csv")
maimai_df = pd.read_csv(maimai_path)
bpm_df = pd.read_csv(bpm_path)
In [5]:
maimai_df
Out[5]:
| Song | Genre | Version | Chart | Difficulty | Level | Achv | Rank | FC/AP | Sync | DX ✦ | DX % | DX Score | Chart Constant | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | maimai | UNiVERSE | DX | BASIC | 6 | 94.23% | AAA | FC | SYNC | 0 | 75.40% | 570/756 | 6 |
| 1 | 系ぎて | maimai | BUDDiES | DX | BASIC | 7 | 96.63% | AAA | - | SYNC | 0 | 74.20% | 766/1032 | 7 |
| 2 | 廻廻奇譚 | POPS&ANIME | UNiVERSE | DX | ADVANCED | 7 | 97.65% | S | - | SYNC | 0 | 79.00% | 886/1122 | 7.5 |
| 3 | INTERNET YAMERO | POPS&ANIME | BUDDiES PLUS | DX | ADVANCED | 7+ | 96.34% | AAA | - | SYNC | 0 | 73.20% | 854/1167 | 7.8 |
| 4 | INTERNET OVERDOSE | POPS&ANIME | FESTiVAL | DX | ADVANCED | 7 | 96.11% | AAA | - | SYNC | 0 | 76.80% | 820/1068 | 7.5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 535 | Kiss Me Kiss | オンゲキ&CHUNITHM | Splash PLUS | DX | MASTER | 12+ | 94.07% | AAA | - | - | 0 | 75.80% | 1635/2157 | 12.8 |
| 536 | Session High⤴ | オンゲキ&CHUNITHM | MiLK PLUS | STD | MASTER | 12+ | 94.75% | AAA | - | - | 0 | 76.20% | 1226/1608 | 12.8 |
| 537 | イロトリドリのメロディ | オンゲキ&CHUNITHM | FiNALE | STD | MASTER | 12+ | 94.80% | AAA | - | - | 0 | 73.40% | 1267/1725 | 12.7 |
| 538 | Agitation! | オンゲキ&CHUNITHM | maimaiでらっくす | DX | MASTER | 12 | 96.04% | AAA | - | - | 0 | 73.20% | 1461/1995 | 12 |
| 539 | Counselor | オンゲキ&CHUNITHM | PiNK | STD | MASTER | 12 | 97.02% | S | - | SYNC | 0 | 81.90% | 1769/2160 | 12.4 |
540 rows × 14 columns
In [6]:
bpm_df
Out[6]:
| songId | bpm | releaseDate | |
|---|---|---|---|
| 0 | "411Ψ892" | 184.0 | 2023-02-03 |
| 1 | #狂った民族2 PRAVARGYAZOOQA | 170.0 | 2022-11-18 |
| 2 | +♂ | 180.0 | 2015-12-09 |
| 3 | -OutsideR:RequieM- | 181.0 | 2019-03-08 |
| 4 | 1000年生きてる | 100.0 | 2023-09-14 |
| ... | ... | ... | ... |
| 1383 | ヤミナベ!!!! | 264.0 | 2025-12-24 |
| 1384 | 勝手に生きましょ | 176.0 | 2026-01-09 |
| 1385 | 化けの花 | 180.0 | 2026-01-09 |
| 1386 | 胡蝶乃舞 | 180.0 | 2026-01-23 |
| 1387 | 零號車輛 | 240.0 | 2026-01-23 |
1388 rows × 3 columns
In [7]:
# Merging datasets
maimai_df.rename(columns={"Song": "songId"}, inplace=True)
merged_df = maimai_df.merge(bpm_df, on="songId", how="left")
print(f"Duplicated: {merged_df.duplicated().sum()}")
merged_df.head()
Duplicated: 0
Out[7]:
| songId | Genre | Version | Chart | Difficulty | Level | Achv | Rank | FC/AP | Sync | DX ✦ | DX % | DX Score | Chart Constant | bpm | releaseDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | maimai | UNiVERSE | DX | BASIC | 6 | 94.23% | AAA | FC | SYNC | 0 | 75.40% | 570/756 | 6 | 211.0 | 2022-02-10 |
| 1 | 系ぎて | maimai | BUDDiES | DX | BASIC | 7 | 96.63% | AAA | - | SYNC | 0 | 74.20% | 766/1032 | 7 | 88.0 | 2024-02-29 |
| 2 | 廻廻奇譚 | POPS&ANIME | UNiVERSE | DX | ADVANCED | 7 | 97.65% | S | - | SYNC | 0 | 79.00% | 886/1122 | 7.5 | 185.0 | 2021-09-16 |
| 3 | INTERNET YAMERO | POPS&ANIME | BUDDiES PLUS | DX | ADVANCED | 7+ | 96.34% | AAA | - | SYNC | 0 | 73.20% | 854/1167 | 7.8 | 185.0 | 2024-03-21 |
| 4 | INTERNET OVERDOSE | POPS&ANIME | FESTiVAL | DX | ADVANCED | 7 | 96.11% | AAA | - | SYNC | 0 | 76.80% | 820/1068 | 7.5 | 163.0 | 2023-03-23 |
In [8]:
# Clean up unnecessary columns
merged_df.drop(columns=["FC/AP", "Sync", "DX ✦", "DX %"], inplace=True)
# Expose missing values
merged_df["Chart Constant"] = pd.to_numeric(
merged_df["Chart Constant"], errors="coerce"
)
In [9]:
# convert releaseDate to numerical values
merged_df["releaseDate"] = pd.to_datetime(merged_df["releaseDate"], errors="coerce")
merged_df["releaseDate"] = (
merged_df["releaseDate"] - merged_df["releaseDate"].min()
).dt.days
merged_df.head(3)
Out[9]:
| songId | Genre | Version | Chart | Difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | maimai | UNiVERSE | DX | BASIC | 6 | 94.23% | AAA | 570/756 | 6.0 | 211.0 | 3501.0 |
| 1 | 系ぎて | maimai | BUDDiES | DX | BASIC | 7 | 96.63% | AAA | 766/1032 | 7.0 | 88.0 | 4250.0 |
| 2 | 廻廻奇譚 | POPS&ANIME | UNiVERSE | DX | ADVANCED | 7 | 97.65% | S | 886/1122 | 7.5 | 185.0 | 3354.0 |
In [10]:
print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
merged_df[merged_df.isna().any(axis=1)]
Total rows with missing values: 15 Rows with missing values:
Out[10]:
| songId | Genre | Version | Chart | Difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | さくゆいたいそう | POPS&ANIME | BUDDiES | DX | ADVANCED | 6+ | 99.26% | SS | 829/1005 | 6.8 | NaN | NaN |
| 78 | 前前前世 | POPS&ANIME | MURASAKi PLUS | STD | EXPERT | 9 | 97.21% | S | 1111/1395 | NaN | 190.0 | 1807.0 |
| 95 | ヒトガタ | POPS&ANIME | UNiVERSE PLUS | DX | EXPERT | 10+ | 99.78% | SS+ | 1463/1650 | 10.8 | NaN | NaN |
| 100 | さくゆいたいそう | POPS&ANIME | BUDDiES | DX | EXPERT | 9+ | 99.46% | SS | 1422/1614 | 9.7 | NaN | NaN |
| 129 | メズマライザー | niconico&VOCALOID™ | PRiSM PLUS (beta) | DX | EXPERT | 10 | 100.65% | SSS+ | 1425/1572 | 10.5 | NaN | NaN |
| 130 | サイエンス | niconico&VOCALOID™ | CiRCLE (experimental) | DX | EXPERT | 10 | 100.78% | SSS+ | 1468/1596 | 10.0 | 188.0 | NaN |
| 196 | ラグトレイン | niconico&VOCALOID™ | UNiVERSE | DX | EXPERT | 9+ | 96.85% | AAA | 1280/1608 | 9.8 | NaN | NaN |
| 224 | 有頂天ドリーマーズ | 東方Project | CiRCLE (experimental) | DX | EXPERT | 10+ | 100.53% | SSS+ | 1182/1338 | 10.8 | 188.0 | NaN |
| 233 | Ultimate taste | 東方Project | PRiSM (beta) | DX | EXPERT | 11 | 99.97% | SS+ | 1547/1785 | 11.0 | NaN | NaN |
| 367 | 411Ψ892 | maimai | FESTiVAL | DX | EXPERT | 12+ | 99.08% | SS | 1568/1893 | 12.7 | NaN | NaN |
| 448 | さくゆいたいそう | POPS&ANIME | BUDDiES | DX | MASTER | 12+ | 99.25% | SS | 1763/2043 | 12.6 | NaN | NaN |
| 467 | メズマライザー | niconico&VOCALOID™ | PRiSM PLUS (beta) | DX | MASTER | 11+ | 100.08% | SSS | 1850/2025 | 11.9 | NaN | NaN |
| 468 | サイエンス | niconico&VOCALOID™ | CiRCLE (experimental) | DX | MASTER | 13 | 98.35% | S+ | 1875/2250 | 13.0 | 188.0 | NaN |
| 495 | エイリアンエイリアン | niconico&VOCALOID™ | MURASAKi PLUS | STD | MASTER | 12+ | 96.18% | AAA | 1775/2268 | 12.8 | NaN | NaN |
| 508 | 有頂天ドリーマーズ | 東方Project | CiRCLE (experimental) | DX | MASTER | 13 | 92.22% | AA | 1415/1935 | 13.1 | 188.0 | NaN |
In [11]:
# Fill missing Chart Constant values based on Level
level_mapping = merged_df.groupby("Level")["Chart Constant"].median()
print(level_mapping)
merged_df.fillna(
{"Chart Constant": merged_df["Level"].map(level_mapping)}, inplace=True
)
print(
f"\nRemaining missing Chart Constant values: {merged_df['Chart Constant'].isna().sum()}"
)
merged_df[["Level", "Chart Constant"]].head(10)
Level 10 10.20 10+ 10.70 11 11.20 11+ 11.80 12 12.35 12+ 12.80 13 13.00 6 6.00 6+ 6.80 7 7.20 7+ 7.80 8 8.20 8+ 8.70 9 9.30 9+ 9.80 Name: Chart Constant, dtype: float64 Remaining missing Chart Constant values: 0
Out[11]:
| Level | Chart Constant | |
|---|---|---|
| 0 | 6 | 6.0 |
| 1 | 7 | 7.0 |
| 2 | 7 | 7.5 |
| 3 | 7+ | 7.8 |
| 4 | 7 | 7.5 |
| 5 | 6+ | 6.8 |
| 6 | 7+ | 7.7 |
| 7 | 7 | 7.5 |
| 8 | 7 | 7.0 |
| 9 | 7+ | 7.9 |
In [12]:
# Simply map Level "6" -> 6.2, "6+" -> 6.8 for remaining missing values
def map_level_to_constant(level):
if isinstance(level, str) and level.endswith("+"):
return float(level[:-1]) + 0.8
else:
try:
return float(level) + 0.2
except ValueError:
return None
merged_df["Chart Constant"] = merged_df.apply(
lambda row: (
map_level_to_constant(row["Level"])
if pd.isna(row["Chart Constant"])
else row["Chart Constant"]
),
axis=1,
)
print(
f"\nRemaining missing Chart Constant values: {merged_df['Chart Constant'].isna().sum()}"
)
merged_df[["Level", "Chart Constant"]].head(10)
Remaining missing Chart Constant values: 0
Out[12]:
| Level | Chart Constant | |
|---|---|---|
| 0 | 6 | 6.0 |
| 1 | 7 | 7.0 |
| 2 | 7 | 7.5 |
| 3 | 7+ | 7.8 |
| 4 | 7 | 7.5 |
| 5 | 6+ | 6.8 |
| 6 | 7+ | 7.7 |
| 7 | 7 | 7.5 |
| 8 | 7 | 7.0 |
| 9 | 7+ | 7.9 |
In [13]:
# Drop easy charts
# merged_df = merged_df[merged_df["Chart Constant"] >= 10.5]
In [14]:
merged_df["DX Score"] = (
merged_df["DX Score"].astype(str).str.split("/").str[1].astype(int)
)
merged_df.head(3)
Out[14]:
| songId | Genre | Version | Chart | Difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | maimai | UNiVERSE | DX | BASIC | 6 | 94.23% | AAA | 756 | 6.0 | 211.0 | 3501.0 |
| 1 | 系ぎて | maimai | BUDDiES | DX | BASIC | 7 | 96.63% | AAA | 1032 | 7.0 | 88.0 | 4250.0 |
| 2 | 廻廻奇譚 | POPS&ANIME | UNiVERSE | DX | ADVANCED | 7 | 97.65% | S | 1122 | 7.5 | 185.0 | 3354.0 |
In [15]:
# Remove the % suffix from the "Achv" column and convert it to float
merged_df["Achv"] = merged_df["Achv"].str.rstrip("%").astype(float)
merged_df.head(5)
Out[15]:
| songId | Genre | Version | Chart | Difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | maimai | UNiVERSE | DX | BASIC | 6 | 94.23 | AAA | 756 | 6.0 | 211.0 | 3501.0 |
| 1 | 系ぎて | maimai | BUDDiES | DX | BASIC | 7 | 96.63 | AAA | 1032 | 7.0 | 88.0 | 4250.0 |
| 2 | 廻廻奇譚 | POPS&ANIME | UNiVERSE | DX | ADVANCED | 7 | 97.65 | S | 1122 | 7.5 | 185.0 | 3354.0 |
| 3 | INTERNET YAMERO | POPS&ANIME | BUDDiES PLUS | DX | ADVANCED | 7+ | 96.34 | AAA | 1167 | 7.8 | 185.0 | 4271.0 |
| 4 | INTERNET OVERDOSE | POPS&ANIME | FESTiVAL | DX | ADVANCED | 7 | 96.11 | AAA | 1068 | 7.5 | 163.0 | 3907.0 |
In [16]:
# display lowest Achv
merged_df.nsmallest(10, "Achv")
Out[16]:
| songId | Genre | Version | Chart | Difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 186 | グリーンライツ・セレナーデ | niconico&VOCALOID™ | maimaiでらっくす | DX | EXPERT | 10 | 66.53 | B | 1839 | 10.3 | 200.0 | 2598.0 |
| 141 | バグ | niconico&VOCALOID™ | BUDDiES | DX | EXPERT | 10 | 69.89 | B | 1854 | 10.2 | 186.0 | 4082.0 |
| 342 | 炎歌 -ほむらうた- | maimai | BUDDiES PLUS | DX | EXPERT | 9+ | 81.31 | A | 1017 | 9.8 | 108.0 | 286.0 |
| 41 | FEEL the BEATS | maimai | PiNK | STD | ADVANCED | 8+ | 83.66 | A | 1485 | 8.6 | 166.0 | 1380.0 |
| 383 | Fragrance | maimai | maimai PLUS | STD | EXPERT | 12+ | 86.69 | A | 882 | 12.7 | 180.0 | 246.0 |
| 144 | セカイはまだ始まってすらいない | niconico&VOCALOID™ | FESTiVAL | DX | EXPERT | 8+ | 88.27 | A | 1134 | 8.7 | 138.0 | 3383.0 |
| 386 | ViRTUS | maimai | FESTiVAL | DX | EXPERT | 13 | 88.97 | A | 2139 | 13.2 | 225.0 | 3886.0 |
| 449 | はんぶんこ | POPS&ANIME | FESTiVAL PLUS | DX | MASTER | 12+ | 89.61 | A | 1131 | 12.9 | 124.0 | 4027.0 |
| 446 | INTERNET OVERDOSE | POPS&ANIME | FESTiVAL | DX | MASTER | 13 | 89.85 | A | 2466 | 13.5 | 163.0 | 3907.0 |
| 218 | ロストワンの号哭 | niconico&VOCALOID™ | ORANGE PLUS | STD | EXPERT | 10+ | 89.98 | A | 1635 | 10.8 | 162.0 | 981.0 |
In [17]:
# drop Achv outliners
merged_df = merged_df[(merged_df["Achv"] >= 85)]
In [18]:
# song map metrics
songs_df = Path("songs.csv")
songs_df = pd.read_csv(songs_df)
songs_df.head()
Out[18]:
| songId | type | difficulty | noteCounts.tap | noteCounts.hold | noteCounts.slide | noteCounts.touch | noteCounts.break | noteCounts.total | noteDesigner | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | "411Ψ892" | dx | basic | 179 | 15 | 4 | 21.0 | 8 | 227 | - |
| 1 | "411Ψ892" | dx | advanced | 298 | 31 | 12 | 31.0 | 10 | 382 | - |
| 2 | "411Ψ892" | dx | expert | 416 | 53 | 52 | 25.0 | 85 | 631 | ロシェ@ペンギン |
| 3 | "411Ψ892" | dx | master | 664 | 53 | 86 | 39.0 | 103 | 945 | サファ太 vs じゃこレモン |
| 4 | #狂った民族2 PRAVARGYAZOOQA | dx | basic | 172 | 10 | 10 | 8.0 | 44 | 244 | - |
In [19]:
# player-map relation metrics
playcount_df = Path("playcount.csv")
playcount_df = pd.read_csv(playcount_df)
playcount_df.head()
Out[19]:
| Name | Version | Difficulty | Level | Achievement | Play Count | Last Played | |
|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | DX | basic | 6 | 94.2324% | 1 | 2025/01/16 00:25 |
| 1 | 系ぎて | DX | basic | 7 | 96.6252% | 1 | 2024/03/29 15:57 |
| 2 | 系ぎて | DX | advanced | 10 | 94.0313% | 1 | 2025/05/27 17:51 |
| 3 | 廻廻奇譚 | DX | advanced | 7 | 97.6466% | 1 | 2024/07/03 15:38 |
| 4 | 廻廻奇譚 | DX | expert | 9+ | 93.9536% | 1 | 2025/04/12 00:19 |
In [20]:
# Shift last played by 1 hour early due to Japan timezone
playcount_df["Last Played"] = pd.to_datetime(playcount_df["Last Played"]) - pd.Timedelta(hours=1)
# Feature engineer Year/Month/Daytime from Last Played
playcount_df["Last Played_Year"] = pd.to_datetime(playcount_df["Last Played"]).dt.year
playcount_df["Last Played_Month"] = pd.to_datetime(playcount_df["Last Played"]).dt.month
playcount_df["Last Played_Day"] = pd.to_datetime(playcount_df["Last Played"]).dt.day
# playcount_df["Last Played_Daytime"] = pd.to_datetime(playcount_df["Last Played"]).dt.hour
# playcount_df["Last Played_Daytime"] = pd.cut(
# playcount_df["Last Played_Daytime"],
# bins=[-1, 12, 18, 24],
# labels=["Morning", "Afternoon", "Night"],
# right=True,
# )
playcount_df["Last Played"] = pd.to_numeric(playcount_df["Last Played"])
playcount_df.head()
Out[20]:
| Name | Version | Difficulty | Level | Achievement | Play Count | Last Played | Last Played_Year | Last Played_Month | Last Played_Day | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | DX | basic | 6 | 94.2324% | 1 | 1736983500000000000 | 2025 | 1 | 15 |
| 1 | 系ぎて | DX | basic | 7 | 96.6252% | 1 | 1711724220000000000 | 2024 | 3 | 29 |
| 2 | 系ぎて | DX | advanced | 10 | 94.0313% | 1 | 1748364660000000000 | 2025 | 5 | 27 |
| 3 | 廻廻奇譚 | DX | advanced | 7 | 97.6466% | 1 | 1720017480000000000 | 2024 | 7 | 3 |
| 4 | 廻廻奇譚 | DX | expert | 9+ | 93.9536% | 1 | 1744413540000000000 | 2025 | 4 | 11 |
In [21]:
# Match columns with merged_df
playcount_df["Difficulty"] = playcount_df["Difficulty"].str.upper()
playcount_df.rename(columns={"Version": "Chart"}, inplace=True)
# Drop redundancy columns
playcount_df.drop(columns=["Achievement", "Level"], inplace=True)
In [22]:
# Merge w.r.t. Song name & Version (DX/STD) & Difficulty (basic/advanced/expert/master)
merged_df = merged_df.merge(
playcount_df,
left_on=["songId", "Chart", "Difficulty"],
right_on=["Name", "Chart", "Difficulty"],
how="left",
)
merged_df.head()
Out[22]:
| songId | Genre | Version | Chart | Difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | Name | Play Count | Last Played | Last Played_Year | Last Played_Month | Last Played_Day | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | maimai | UNiVERSE | DX | BASIC | 6 | 94.23 | AAA | 756 | 6.0 | 211.0 | 3501.0 | 躯樹の墓守 | 1.0 | 1.736984e+18 | 2025.0 | 1.0 | 15.0 |
| 1 | 系ぎて | maimai | BUDDiES | DX | BASIC | 7 | 96.63 | AAA | 1032 | 7.0 | 88.0 | 4250.0 | 系ぎて | 1.0 | 1.711724e+18 | 2024.0 | 3.0 | 29.0 |
| 2 | 廻廻奇譚 | POPS&ANIME | UNiVERSE | DX | ADVANCED | 7 | 97.65 | S | 1122 | 7.5 | 185.0 | 3354.0 | 廻廻奇譚 | 1.0 | 1.720017e+18 | 2024.0 | 7.0 | 3.0 |
| 3 | INTERNET YAMERO | POPS&ANIME | BUDDiES PLUS | DX | ADVANCED | 7+ | 96.34 | AAA | 1167 | 7.8 | 185.0 | 4271.0 | INTERNET YAMERO | 1.0 | 1.724088e+18 | 2024.0 | 8.0 | 19.0 |
| 4 | INTERNET OVERDOSE | POPS&ANIME | FESTiVAL | DX | ADVANCED | 7 | 96.11 | AAA | 1068 | 7.5 | 163.0 | 3907.0 | INTERNET OVERDOSE | 1.0 | 1.736984e+18 | 2025.0 | 1.0 | 15.0 |
In [23]:
songs_df["songId"] = songs_df["songId"].str.replace('"', "")
merged_df.rename(columns={"Chart": "type", "Difficulty": "difficulty"}, inplace=True)
merged_df["type"] = merged_df["type"].str.lower()
merged_df["difficulty"] = merged_df["difficulty"].str.lower()
merged_df = merged_df.merge(
songs_df, on=["songId", "type", "difficulty"], how="left", copy=False
)
In [24]:
# Check new dataset for missing values
print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
merged_df[merged_df.isna().any(axis=1)]
Total rows with missing values: 165 Rows with missing values:
Out[24]:
| songId | Genre | Version | type | difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | Name | Play Count | Last Played | Last Played_Year | Last Played_Month | Last Played_Day | noteCounts.tap | noteCounts.hold | noteCounts.slide | noteCounts.touch | noteCounts.break | noteCounts.total | noteDesigner | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | さくゆいたいそう | POPS&ANIME | BUDDiES | dx | advanced | 6+ | 99.26 | SS | 1005 | 6.8 | NaN | NaN | さくゆいたいそう | 1.0 | 1.740179e+18 | 2025.0 | 2.0 | 21.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 8 | ミラクル・ショッピング | POPS&ANIME | MiLK | std | advanced | 7 | 98.15 | S+ | 1023 | 7.0 | 167.0 | 2050.0 | NaN | NaN | NaN | NaN | NaN | NaN | 272.0 | 20.0 | 33.0 | NaN | 16.0 | 341.0 | - |
| 12 | ヒバナ | niconico&VOCALOID™ | FiNALE | std | advanced | 8 | 98.13 | S+ | 1215 | 8.2 | 200.0 | 2346.0 | NaN | NaN | NaN | NaN | NaN | NaN | 304.0 | 31.0 | 39.0 | NaN | 31.0 | 405.0 | - |
| 16 | ロキ | niconico&VOCALOID™ | FiNALE | std | advanced | 8 | 98.97 | S+ | 1101 | 8.0 | 150.0 | 2346.0 | NaN | NaN | NaN | NaN | NaN | NaN | 313.0 | 22.0 | 10.0 | NaN | 22.0 | 367.0 | - |
| 17 | パーフェクト生命 | niconico&VOCALOID™ | PiNK PLUS | std | advanced | 8 | 100.21 | SSS | 1389 | 8.1 | 200.0 | 1464.0 | NaN | NaN | NaN | NaN | NaN | NaN | 424.0 | 24.0 | 5.0 | NaN | 10.0 | 463.0 | - |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 525 | Sweets×Sweets | maimai | maimai | std | master | 11 | 96.53 | AAA | 1161 | 11.4 | 132.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 343.0 | 23.0 | 12.0 | NaN | 9.0 | 387.0 | maimai TEAM |
| 526 | ネコ日和。 | maimai | maimai | std | master | 12+ | 98.72 | S+ | 1038 | 12.6 | 171.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 283.0 | 36.0 | 11.0 | NaN | 16.0 | 346.0 | ニャイン |
| 532 | Session High⤴ | オンゲキ&CHUNITHM | MiLK PLUS | std | master | 12+ | 94.75 | AAA | 1608 | 12.8 | 200.0 | 2171.0 | NaN | NaN | NaN | NaN | NaN | NaN | 390.0 | 20.0 | 79.0 | NaN | 47.0 | 536.0 | すきやき奉行 |
| 533 | イロトリドリのメロディ | オンゲキ&CHUNITHM | FiNALE | std | master | 12+ | 94.80 | AAA | 1725 | 12.7 | 140.0 | 2431.0 | NaN | NaN | NaN | NaN | NaN | NaN | 483.0 | 27.0 | 42.0 | NaN | 23.0 | 575.0 | 華火職人 |
| 535 | Counselor | オンゲキ&CHUNITHM | PiNK | std | master | 12 | 97.02 | S | 2160 | 12.4 | 135.0 | 1246.0 | NaN | NaN | NaN | NaN | NaN | NaN | 526.0 | 69.0 | 88.0 | NaN | 37.0 | 720.0 | Techno Kitchen |
165 rows × 25 columns
In [25]:
note_count_columns = [
"noteCounts.tap",
"noteCounts.hold",
"noteCounts.slide",
"noteCounts.touch",
"noteCounts.break",
]
for col in note_count_columns:
if col in merged_df.columns:
merged_df[col] = pd.to_numeric(merged_df[col], errors="coerce")
merged_df[note_count_columns] = merged_df[note_count_columns].fillna(0)
total_notes = merged_df[note_count_columns].sum(axis=1).replace(0, 1)
merged_df["noteCounts.total"] = merged_df["DX Score"] / 3
merged_df[note_count_columns] = (
merged_df[note_count_columns]
.multiply(merged_df["noteCounts.total"] / total_notes, axis=0)
.round()
)
print(
f"\nRemaining missing noteCounts values:\n{merged_df[note_count_columns].isna().sum()}"
)
merged_df[merged_df[note_count_columns].isna().any(axis=1)]
Remaining missing noteCounts values: noteCounts.tap 0 noteCounts.hold 0 noteCounts.slide 0 noteCounts.touch 0 noteCounts.break 0 dtype: int64
Out[25]:
| songId | Genre | Version | type | difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | Name | Play Count | Last Played | Last Played_Year | Last Played_Month | Last Played_Day | noteCounts.tap | noteCounts.hold | noteCounts.slide | noteCounts.touch | noteCounts.break | noteCounts.total | noteDesigner |
|---|
In [26]:
# Check new dataset for missing values
print(f"\nTotal rows with missing values: {merged_df.isna().any(axis=1).sum()}")
print("Rows with missing values:")
# Hide cols with no missing values
merged_df[merged_df.isna().any(axis=1)].head()
Total rows with missing values: 165 Rows with missing values:
Out[26]:
| songId | Genre | Version | type | difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | Name | Play Count | Last Played | Last Played_Year | Last Played_Month | Last Played_Day | noteCounts.tap | noteCounts.hold | noteCounts.slide | noteCounts.touch | noteCounts.break | noteCounts.total | noteDesigner | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | さくゆいたいそう | POPS&ANIME | BUDDiES | dx | advanced | 6+ | 99.26 | SS | 1005 | 6.8 | NaN | NaN | さくゆいたいそう | 1.0 | 1.740179e+18 | 2025.0 | 2.0 | 21.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 335.0 | NaN |
| 8 | ミラクル・ショッピング | POPS&ANIME | MiLK | std | advanced | 7 | 98.15 | S+ | 1023 | 7.0 | 167.0 | 2050.0 | NaN | NaN | NaN | NaN | NaN | NaN | 272.0 | 20.0 | 33.0 | 0.0 | 16.0 | 341.0 | - |
| 12 | ヒバナ | niconico&VOCALOID™ | FiNALE | std | advanced | 8 | 98.13 | S+ | 1215 | 8.2 | 200.0 | 2346.0 | NaN | NaN | NaN | NaN | NaN | NaN | 304.0 | 31.0 | 39.0 | 0.0 | 31.0 | 405.0 | - |
| 16 | ロキ | niconico&VOCALOID™ | FiNALE | std | advanced | 8 | 98.97 | S+ | 1101 | 8.0 | 150.0 | 2346.0 | NaN | NaN | NaN | NaN | NaN | NaN | 313.0 | 22.0 | 10.0 | 0.0 | 22.0 | 367.0 | - |
| 17 | パーフェクト生命 | niconico&VOCALOID™ | PiNK PLUS | std | advanced | 8 | 100.21 | SSS | 1389 | 8.1 | 200.0 | 1464.0 | NaN | NaN | NaN | NaN | NaN | NaN | 424.0 | 24.0 | 5.0 | 0.0 | 10.0 | 463.0 | - |
In [27]:
# SongId specific feature engineering
# * Length of songId
# * language classifier (English/Japanese)
# * HasPunctuation
merged_df["songId_length"] = merged_df["songId"].str.strip().str.len()
merged_df["songId_English"] = (
merged_df["songId"].str.count(r"[A-Za-z0-9]").fillna(0).astype(int)
)
merged_df["songId_Japanese"] = (
merged_df["songId"]
.str.count(r"[\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\uFF65-\uFF9F]")
.fillna(0)
.astype(int)
)
merged_df["songId_Punctuation"] = (
merged_df["songId"].str.count(r"[^\w\s]").fillna(0).astype(int)
)
merged_df.columns
Out[27]:
Index(['songId', 'Genre', 'Version', 'type', 'difficulty', 'Level', 'Achv',
'Rank', 'DX Score', 'Chart Constant', 'bpm', 'releaseDate', 'Name',
'Play Count', 'Last Played', 'Last Played_Year', 'Last Played_Month',
'Last Played_Day', 'noteCounts.tap', 'noteCounts.hold',
'noteCounts.slide', 'noteCounts.touch', 'noteCounts.break',
'noteCounts.total', 'noteDesigner', 'songId_length', 'songId_English',
'songId_Japanese', 'songId_Punctuation'],
dtype='object')
In [28]:
# Unique versions
print(merged_df["Version"].unique())
# Sort versions properly
versions = [
"maimai",
"maimai PLUS",
"GreeN",
"GreeN PLUS",
"ORANGE",
"ORANGE PLUS",
"PiNK",
"PiNK PLUS",
"MURASAKi",
"MURASAKi PLUS",
"MiLK",
"MiLK PLUS",
"FiNALE",
"maimaiでらっくす",
"maimaiでらっくす PLUS",
"Splash",
"Splash PLUS",
"UNiVERSE",
"UNiVERSE PLUS",
"FESTiVAL",
"FESTiVAL PLUS",
"BUDDiES",
"BUDDiES PLUS",
"PRiSM (beta)",
"PRiSM PLUS (beta)",
"CiRCLE",
# TODO: Don't hardcode this. Also, this is not needed for tree-based models.
# We use this only for linear models. If we remove Ridge, we can cut this liability.
]
version_order = {version: i + 1 for i, version in enumerate(versions)}
merged_df["Version"] = merged_df["Version"].map(version_order)
merged_df["Version"]
['UNiVERSE' 'BUDDiES' 'BUDDiES PLUS' 'FESTiVAL' 'MiLK' 'FiNALE' 'maimaiでらっくす' 'PiNK PLUS' 'MURASAKi' 'MURASAKi PLUS' 'PRiSM (beta)' 'ORANGE' 'MiLK PLUS' 'UNiVERSE PLUS' 'FESTiVAL PLUS' 'PiNK' 'ORANGE PLUS' 'maimai PLUS' 'PRiSM PLUS (beta)' 'maimaiでらっくす PLUS' 'Splash' 'Splash PLUS' 'maimai' 'CiRCLE (experimental)' 'GreeN' 'GreeN PLUS']
Out[28]:
0 18.0
1 22.0
2 18.0
3 23.0
4 20.0
...
531 17.0
532 12.0
533 13.0
534 14.0
535 7.0
Name: Version, Length: 536, dtype: float64
In [29]:
# Convert "type" into simple numeric categories
type_mapping = {"std": 0, "dx": 1}
merged_df["type"] = merged_df["type"].map(type_mapping)
merged_df.head(3)
Out[29]:
| songId | Genre | Version | type | difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | Name | Play Count | Last Played | Last Played_Year | Last Played_Month | Last Played_Day | noteCounts.tap | noteCounts.hold | noteCounts.slide | noteCounts.touch | noteCounts.break | noteCounts.total | noteDesigner | songId_length | songId_English | songId_Japanese | songId_Punctuation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | maimai | 18.0 | 1 | basic | 6 | 94.23 | AAA | 756 | 6.0 | 211.0 | 3501.0 | 躯樹の墓守 | 1.0 | 1.736984e+18 | 2025.0 | 1.0 | 15.0 | 214.0 | 16.0 | 6.0 | 10.0 | 6.0 | 252.0 | - | 5 | 0 | 5 | 0 |
| 1 | 系ぎて | maimai | 22.0 | 1 | basic | 7 | 96.63 | AAA | 1032 | 7.0 | 88.0 | 4250.0 | 系ぎて | 1.0 | 1.711724e+18 | 2024.0 | 3.0 | 29.0 | 282.0 | 26.0 | 8.0 | 14.0 | 14.0 | 344.0 | - | 3 | 0 | 3 | 0 |
| 2 | 廻廻奇譚 | POPS&ANIME | 18.0 | 1 | advanced | 7 | 97.65 | S | 1122 | 7.5 | 185.0 | 3354.0 | 廻廻奇譚 | 1.0 | 1.720017e+18 | 2024.0 | 7.0 | 3.0 | 301.0 | 26.0 | 19.0 | 22.0 | 6.0 | 374.0 | - | 4 | 0 | 4 | 0 |
In [30]:
# Convert "difficulty" into simple numeric categories
difficulty_mapping = {
"basic": 0,
"advanced": 1,
"expert": 2,
"master": 3,
"re:master": 4,
}
merged_df["difficulty"] = merged_df["difficulty"].map(difficulty_mapping)
merged_df.head(3)
Out[30]:
| songId | Genre | Version | type | difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | Name | Play Count | Last Played | Last Played_Year | Last Played_Month | Last Played_Day | noteCounts.tap | noteCounts.hold | noteCounts.slide | noteCounts.touch | noteCounts.break | noteCounts.total | noteDesigner | songId_length | songId_English | songId_Japanese | songId_Punctuation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 躯樹の墓守 | maimai | 18.0 | 1 | 0 | 6 | 94.23 | AAA | 756 | 6.0 | 211.0 | 3501.0 | 躯樹の墓守 | 1.0 | 1.736984e+18 | 2025.0 | 1.0 | 15.0 | 214.0 | 16.0 | 6.0 | 10.0 | 6.0 | 252.0 | - | 5 | 0 | 5 | 0 |
| 1 | 系ぎて | maimai | 22.0 | 1 | 0 | 7 | 96.63 | AAA | 1032 | 7.0 | 88.0 | 4250.0 | 系ぎて | 1.0 | 1.711724e+18 | 2024.0 | 3.0 | 29.0 | 282.0 | 26.0 | 8.0 | 14.0 | 14.0 | 344.0 | - | 3 | 0 | 3 | 0 |
| 2 | 廻廻奇譚 | POPS&ANIME | 18.0 | 1 | 1 | 7 | 97.65 | S | 1122 | 7.5 | 185.0 | 3354.0 | 廻廻奇譚 | 1.0 | 1.720017e+18 | 2024.0 | 7.0 | 3.0 | 301.0 | 26.0 | 19.0 | 22.0 | 6.0 | 374.0 | - | 4 | 0 | 4 | 0 |
In [31]:
merged_df.describe(include="all")
Out[31]:
| songId | Genre | Version | type | difficulty | Level | Achv | Rank | DX Score | Chart Constant | bpm | releaseDate | Name | Play Count | Last Played | Last Played_Year | Last Played_Month | Last Played_Day | noteCounts.tap | noteCounts.hold | noteCounts.slide | noteCounts.touch | noteCounts.break | noteCounts.total | noteDesigner | songId_length | songId_English | songId_Japanese | songId_Punctuation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 536 | 536 | 521.000000 | 536.000000 | 536.000000 | 536 | 536.000000 | 536 | 536.000000 | 536.000000 | 526.000000 | 522.000000 | 383 | 383.000000 | 3.830000e+02 | 383.000000 | 383.000000 | 383.000000 | 536.000000 | 536.000000 | 536.000000 | 536.000000 | 536.000000 | 536.000000 | 527 | 536.000000 | 536.000000 | 536.000000 | 536.000000 |
| unique | 440 | 6 | NaN | NaN | NaN | 15 | NaN | 9 | NaN | NaN | NaN | NaN | 319 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 45 | NaN | NaN | NaN | NaN |
| top | INTERNET OVERDOSE | niconico&VOCALOID™ | NaN | NaN | NaN | 12 | NaN | S+ | NaN | NaN | NaN | NaN | INTERNET OVERDOSE | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | はっぴー | NaN | NaN | NaN | NaN |
| freq | 3 | 146 | NaN | NaN | NaN | 78 | NaN | 92 | NaN | NaN | NaN | NaN | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 79 | NaN | NaN | NaN | NaN |
| mean | NaN | NaN | 17.366603 | 0.720149 | 2.055970 | NaN | 98.289291 | NaN | 1598.423507 | 10.935448 | 174.192015 | 3219.281609 | NaN | 2.561358 | 1.757252e+18 | 2025.208877 | 6.156658 | 17.389034 | 376.194030 | 51.875000 | 44.673507 | 22.535448 | 27.848881 | 532.807836 | NaN | 10.867537 | 5.227612 | 4.604478 | 0.514925 |
| std | NaN | NaN | 6.431739 | 0.449345 | 0.580559 | NaN | 2.513036 | NaN | 390.847516 | 1.465903 | 33.361900 | 1288.210058 | NaN | 2.808135 | 1.083736e+16 | 0.478024 | 3.780842 | 9.088142 | 108.806712 | 26.458312 | 30.729080 | 23.948985 | 22.903485 | 130.282505 | NaN | 8.753274 | 8.251902 | 4.833711 | 1.113956 |
| min | NaN | NaN | 1.000000 | 0.000000 | 0.000000 | NaN | 86.690000 | NaN | 426.000000 | 6.000000 | 70.000000 | 0.000000 | NaN | 1.000000 | 1.711724e+18 | 2024.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 142.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | NaN | NaN | 13.000000 | 0.000000 | 2.000000 | NaN | 97.130000 | NaN | 1329.000000 | 10.000000 | 150.000000 | 2346.000000 | NaN | 1.000000 | 1.752878e+18 | 2025.000000 | 1.000000 | 9.000000 | 311.000000 | 34.000000 | 22.000000 | 0.000000 | 11.000000 | 443.000000 | NaN | 6.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | NaN | NaN | 19.000000 | 1.000000 | 2.000000 | NaN | 99.025000 | NaN | 1590.000000 | 11.000000 | 176.000000 | 3543.000000 | NaN | 1.000000 | 1.756314e+18 | 2025.000000 | 7.000000 | 18.000000 | 378.000000 | 49.000000 | 40.000000 | 17.000000 | 22.000000 | 530.000000 | NaN | 9.000000 | 0.000000 | 4.000000 | 0.000000 |
| 75% | NaN | NaN | 23.000000 | 1.000000 | 2.000000 | NaN | 100.185000 | NaN | 1878.000000 | 12.000000 | 190.000000 | 4271.000000 | NaN | 3.000000 | 1.767130e+18 | 2025.000000 | 8.000000 | 26.000000 | 450.250000 | 69.000000 | 59.250000 | 34.250000 | 37.000000 | 626.000000 | NaN | 14.000000 | 10.000000 | 8.000000 | 1.000000 |
| max | NaN | NaN | 25.000000 | 1.000000 | 3.000000 | NaN | 101.000000 | NaN | 2958.000000 | 13.500000 | 339.000000 | 4944.000000 | NaN | 22.000000 | 1.769882e+18 | 2026.000000 | 12.000000 | 31.000000 | 726.000000 | 184.000000 | 185.000000 | 153.000000 | 180.000000 | 986.000000 | NaN | 149.000000 | 120.000000 | 28.000000 | 7.000000 |
In [32]:
# Filter out numerical columns for correlation matrix
correlation_matrix = merged_df.select_dtypes(include=["number"]).corr()
fig = plt.figure(clear=True, figsize=(14, 12))
sns.heatmap(
correlation_matrix,
annot=True,
fmt=".2f",
square=True,
cmap="coolwarm",
center=0,
linewidths=0.5,
cbar_kws={"shrink": 0.8},
)
plt.title("Correlation Matrix", fontsize=16, pad=20)
plt.show()
In [33]:
def correlation_rankings(
corr_mat: pd.DataFrame, target_column: str, threshold: float = 0.95
):
"""Print all abs(correlation with anything) that exceeds a threshold"""
output = []
for col in corr_mat.columns:
if col == target_column:
continue
strong_correlations = (
corr_mat[col][corr_mat[col].abs() > threshold]
.drop(col)
.sort_values(ascending=False, key=lambda x: x.abs())
)
if not strong_correlations.empty:
formatted_corrs: list = []
for corr_col, value in strong_correlations.items():
color: str = "bold green" if value > 0 else "bold red"
formatted_corrs.append(f"[{color}]{corr_col} ({value:.2f})[/{color}]")
corr_with_target = corr_mat.loc[col, target_column]
correlations_str = ", ".join(formatted_corrs)
header = "".join(
[
f"[bold sky_blue1]{col} ({target_column}: {corr_with_target:.2f}, ",
f"count: {len(strong_correlations)}):[/bold sky_blue1]",
]
)
full_line = f"{header} [{correlations_str}]"
output.append(full_line)
output.sort(
key=lambda x: float(x.split(f"({target_column}: ")[1].split(",")[0]),
reverse=True,
)
if output:
rich.print("\n".join(output))
correlation_rankings(correlation_matrix, target_column="Achv")
DX Score (Achv: -0.07, count: 1): [noteCounts.total (1.00)] noteCounts.total (Achv: -0.07, count: 1): [DX Score (1.00)]
In [34]:
# Drop collinear columns
merged_df.drop(
columns=[
"DX Score",
],
inplace=True,
)
correlation_matrix = merged_df.select_dtypes(include=["number"]).corr()
correlation_rankings(correlation_matrix, target_column="Achv")
fig = plt.figure(clear=True, figsize=(14, 12))
sns.heatmap(
correlation_matrix,
annot=True,
fmt=".2f",
square=True,
cmap="coolwarm",
center=0,
linewidths=0.5,
cbar_kws={"shrink": 0.8},
)
plt.title("Correlation Matrix (Reduced)", fontsize=16, pad=20)
plt.show()
In [35]:
# Pairwise correlation with seaborn
sns.pairplot(
merged_df,
diag_kind="kde",
plot_kws={"alpha": 0.4, "s": 15},
corner=True,
height=1.7,
)
plt.show()
Regression
In [36]:
print(merged_df.columns)
numerical_features = (
merged_df.select_dtypes(include=["number"]).columns.drop("Achv").tolist()
)
categorical_features = ["Genre"]
drop_columns = [
feature
for feature in merged_df.columns
if feature not in numerical_features + categorical_features + ["Achv"]
]
print(f"Dropping columns: {drop_columns}")
merged_df.drop(
columns=drop_columns,
inplace=True,
)
merged_df.dropna(inplace=True)
Index(['songId', 'Genre', 'Version', 'type', 'difficulty', 'Level', 'Achv',
'Rank', 'Chart Constant', 'bpm', 'releaseDate', 'Name', 'Play Count',
'Last Played', 'Last Played_Year', 'Last Played_Month',
'Last Played_Day', 'noteCounts.tap', 'noteCounts.hold',
'noteCounts.slide', 'noteCounts.touch', 'noteCounts.break',
'noteCounts.total', 'noteDesigner', 'songId_length', 'songId_English',
'songId_Japanese', 'songId_Punctuation'],
dtype='object')
Dropping columns: ['songId', 'Level', 'Rank', 'Name', 'noteDesigner']
In [37]:
# merged_df.to_csv("maimai.cleaned.csv", index=False)
In [58]:
merged_df.sort_values(by="Last Played", inplace=True)
testset_size = 0.5
X_train, X_test = merged_df.iloc[
: int(len(merged_df) * (1 - testset_size)), :
], merged_df.iloc[
int(len(merged_df) * (1 - testset_size)) :, :
]
y_train, y_test = X_train.pop("Achv"), X_test.pop("Achv")
In [59]:
# Visualize distribution of Achv
plt.figure(figsize=(6, 3))
sns.histplot(y_train, bins=100, kde=True)
plt.title("Distribution of y_train")
plt.xlabel("Achv (%)")
plt.show()
TRANSFORM_ACHV = ["box-cox", "log", "none"][0]
if TRANSFORM_ACHV == "box-cox":
# Convert non-linear Achv score to a (almost) gaussian like distribution
y_train, achv_boxcox_lambda = stats.boxcox(y_train)
y_test = stats.boxcox(y_test, lmbda=achv_boxcox_lambda)
# The target variable now has crazy large values, so we normalize it
scaler = StandardScaler()
y_train = scaler.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test = scaler.transform(y_test.reshape(-1, 1)).flatten()
print(f"Achv Box-Cox Lambda: {achv_boxcox_lambda}")
elif TRANSFORM_ACHV == "log":
y_train = np.log(y_train / (101 - y_train))
y_test = np.log(y_test / (101 - y_test))
else:
pass
plt.figure(figsize=(6, 3))
sns.histplot(y_train, bins=100, kde=True)
plt.title(f"Distribution of y_train after {TRANSFORM_ACHV} Transformation")
plt.xlabel("Achv")
plt.ylabel("Frequency")
plt.show()
def Achv_inverse_transform(y_transformed):
if TRANSFORM_ACHV == "box-cox":
# Inverse Box-Cox transformation
y = np.asarray(y_transformed).reshape(-1, 1)
y = scaler.inverse_transform(y).flatten()
y = scipy.special.inv_boxcox(y, achv_boxcox_lambda)
elif TRANSFORM_ACHV == "log":
y = 101 / (1 + np.exp(-y_transformed))
else:
y = y_transformed
return y
Achv Box-Cox Lambda: 27.019015925780376
In [60]:
preprocessor = ColumnTransformer(
transformers=[
(
"num",
Pipeline(
steps=[
(
"poly",
PolynomialFeatures(
degree=1, include_bias=False, interaction_only=False
),
),
("scaler", StandardScaler()),
]
),
numerical_features,
),
(
"cat",
OneHotEncoder(
sparse_output=False,
handle_unknown="error",
categories=[
merged_df[column].unique().tolist()
for column in categorical_features
],
),
categorical_features,
),
]
)
pipeline = Pipeline(
steps=[
("preprocessor", preprocessor),
(
"regressor",
ElasticNetCV(
max_iter=100000,
l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],
random_state=RANDOM_STATE,
cv=TimeSeriesSplit(n_splits=5),
),
),
]
)
pipeline.fit(X_train, y_train)
chosen_alpha = pipeline.named_steps["regressor"].alpha_
print("Chosen alpha:", chosen_alpha)
l1_ratio = pipeline.named_steps["regressor"].l1_ratio_
print("Chosen l1_ratio:", l1_ratio)
Chosen alpha: 0.06262986919313132 Chosen l1_ratio: 1.0
In [68]:
y_train_pred = pipeline.predict(X_train)
train_r2 = r2_score(y_train, y_train_pred)
train_mse = mean_absolute_error(y_train, y_train_pred)
train_mse_transformed = mean_absolute_error(
Achv_inverse_transform(y_train), Achv_inverse_transform(y_train_pred)
)
print("Train set performance:")
print(f"R^2 Score: {train_r2:.4f}")
print(f"Mean Absolute Error: {train_mse:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {train_mse_transformed:.4f}")
y_test_pred = pipeline.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_absolute_error(y_test, y_test_pred)
test_mse_transformed = mean_absolute_error(
Achv_inverse_transform(y_test), Achv_inverse_transform(y_test_pred)
)
print("Test set performance:")
print(f"R^2 Score: {test_r2:.4f}")
print(f"Mean Absolute Error: {test_mse:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {test_mse_transformed:.4f}")
cat_encoder = pipeline.named_steps["preprocessor"].named_transformers_["cat"]
cat_feature_names = cat_encoder.get_feature_names_out(categorical_features)
all_feature_names = (
list(cat_feature_names)
+ pipeline.named_steps["preprocessor"]
.named_transformers_["num"]
.named_steps["poly"]
.get_feature_names_out(numerical_features)
.tolist()
)
regressor_step = pipeline.named_steps["regressor"]
results_df = pd.DataFrame(
{
"Feature": [all_feature_names[i] for i in range(len(all_feature_names))],
"Coefficient": regressor_step.coef_
}
).sort_values(by="Coefficient", ascending=False, key=abs)
results_df = results_df[results_df["Coefficient"] != 0]
display(results_df)
Train set performance: R^2 Score: 0.5654 Mean Absolute Error: 0.5362 Mean Absolute Error (Inverse Transformed): 1.1676 Test set performance: R^2 Score: 0.3839 Mean Absolute Error: 0.6909 Mean Absolute Error (Inverse Transformed): 1.3568
| Feature | Coefficient | |
|---|---|---|
| 3 | Genre_東方Project | -0.571551 |
| 7 | type | 0.480025 |
| 9 | Chart Constant | 0.320098 |
| 6 | Version | 0.169017 |
| 0 | Genre_niconico&VOCALOID™ | 0.102858 |
| 19 | noteCounts.slide | 0.019177 |
| 4 | Genre_GAME&VARIETY | 0.003423 |
In [69]:
y_train_inv = Achv_inverse_transform(y_train)
y_train_pred_inv = Achv_inverse_transform(y_train_pred)
y_test_inv = Achv_inverse_transform(y_test)
y_test_pred_inv = Achv_inverse_transform(y_test_pred)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
# Common scatter kwargs
scatter_kws = {"alpha": 0.6, "s": 40, "edgecolor": "white", "linewidth": 0.5}
line_kws = {"color": "#333333", "linestyle": "--", "linewidth": 2, "alpha": 0.8}
# Predicted vs Actual for training set
axes[0].scatter(y_train_inv, y_train_pred_inv, **scatter_kws)
axes[0].plot(
[y_train_inv.min(), y_train_inv.max()],
[y_train_inv.min(), y_train_inv.max()],
**line_kws,
)
axes[0].set_xlabel("Actual Achv")
axes[0].set_ylabel("Predicted Achv")
axes[0].set_title("Training Set: Predicted vs Actual")
# Predicted vs Actual for test set
axes[1].scatter(y_test_inv, y_test_pred_inv, color="C1", **scatter_kws)
axes[1].plot(
[y_test_inv.min(), y_test_inv.max()],
[y_test_inv.min(), y_test_inv.max()],
**line_kws,
)
axes[1].set_xlabel("Actual Achv")
axes[1].set_ylabel("Predicted Achv")
axes[1].set_title("Test Set: Predicted vs Actual")
# Residual QQ plot
residuals = y_test - y_test_pred
stats.probplot(residuals, dist="norm", plot=axes[2])
axes[2].get_lines()[0].set_markerfacecolor("C2")
axes[2].get_lines()[0].set_markeredgecolor("white")
axes[2].get_lines()[0].set_alpha(0.6)
axes[2].get_lines()[1].set_color("#333333")
axes[2].get_lines()[1].set_linewidth(2)
axes[2].set_title("QQ Plot of Residuals")
plt.tight_layout()
plt.show()
In [90]:
preprocessor_rf = ColumnTransformer(
transformers=[
("num", "passthrough", numerical_features),
(
"cat",
OneHotEncoder(
sparse_output=True,
handle_unknown="error",
categories=[
merged_df[column].unique().tolist()
for column in categorical_features
],
),
categorical_features,
),
]
)
pipeline_rf = Pipeline(
steps=[
("preprocessor", preprocessor_rf),
(
"rfr",
RandomForestRegressor(
random_state=RANDOM_STATE,
),
),
]
)
param_grid = {
"rfr__max_depth": np.arange(2, 5, 1),
"rfr__min_samples_leaf": np.arange(2, 50, 1),
"rfr__ccp_alpha": np.arange(0, 0.04, 0.001),
}
grid_search = RandomizedSearchCV(
pipeline_rf,
param_grid,
scoring="r2",
n_jobs=-1,
verbose=10,
random_state=RANDOM_STATE,
n_iter=100,
cv=TimeSeriesSplit(n_splits=10),
)
grid_search.fit(X_train, y_train)
pipeline_rf = grid_search.best_estimator_
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV R^2: {grid_search.best_score_:.4f}")
Fitting 10 folds for each of 100 candidates, totalling 1000 fits
Best parameters: {'rfr__min_samples_leaf': np.int64(4), 'rfr__max_depth': np.int64(4), 'rfr__ccp_alpha': np.float64(0.003)}
Best CV R^2: -0.1283
In [91]:
y_train_pred_rf = pipeline_rf.predict(X_train)
train_r2_rf = r2_score(y_train, y_train_pred_rf)
train_mse_rf = mean_absolute_error(y_train, y_train_pred_rf)
train_mse_rf_transformed = mean_absolute_error(
Achv_inverse_transform(y_train), Achv_inverse_transform(y_train_pred_rf)
)
print("Train set performance:")
print(f"R^2 Score: {train_r2_rf:.4f}")
print(f"Mean Absolute Error: {train_mse_rf:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {train_mse_rf_transformed:.4f}")
y_test_pred_rf = pipeline_rf.predict(X_test)
test_r2_rf = r2_score(y_test, y_test_pred_rf)
test_mse_rf = mean_absolute_error(y_test, y_test_pred_rf)
test_mse_rf_transformed = mean_absolute_error(
Achv_inverse_transform(y_test), Achv_inverse_transform(y_test_pred_rf)
)
print("Test set performance:")
print(f"R^2 Score: {test_r2_rf:.4f}")
print(f"Mean Absolute Error: {test_mse_rf:.4f}")
print(f"Mean Absolute Error (Inverse Transformed): {test_mse_rf_transformed:.4f}")
Train set performance: R^2 Score: 0.7257 Mean Absolute Error: 0.4245 Mean Absolute Error (Inverse Transformed): 0.9093 Test set performance: R^2 Score: 0.4535 Mean Absolute Error: 0.7243 Mean Absolute Error (Inverse Transformed): 1.3905
In [92]:
y_train_inv_rf = Achv_inverse_transform(y_train)
y_train_pred_inv_rf = Achv_inverse_transform(y_train_pred_rf)
y_test_inv_rf = Achv_inverse_transform(y_test)
y_test_pred_inv_rf = Achv_inverse_transform(y_test_pred_rf)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# Common scatter kwargs
scatter_kws = {"alpha": 0.6, "s": 40, "edgecolor": "white", "linewidth": 0.5}
line_kws = {"color": "#333333", "linestyle": "--", "linewidth": 2, "alpha": 0.8}
# 1. Predicted vs Actual for training set
axes[0].scatter(y_train_inv_rf, y_train_pred_inv_rf, color="C0", **scatter_kws)
axes[0].plot(
[y_train_inv_rf.min(), y_train_inv_rf.max()],
[y_train_inv_rf.min(), y_train_inv_rf.max()],
**line_kws,
)
axes[0].set_xlabel("Actual Achv")
axes[0].set_ylabel("Predicted Achv")
axes[0].set_title("Training Set: Predicted vs Actual (RF)")
# 2. Predicted vs Actual for test set
axes[1].scatter(y_test_inv_rf, y_test_pred_inv_rf, color="C1", **scatter_kws)
axes[1].plot(
[y_test_inv_rf.min(), y_test_inv_rf.max()],
[y_test_inv_rf.min(), y_test_inv_rf.max()],
**line_kws,
)
axes[1].set_xlabel("Actual Achv")
axes[1].set_ylabel("Predicted Achv")
axes[1].set_title("Test Set: Predicted vs Actual (RF)")
plt.tight_layout()
plt.show()
In [93]:
rf_features_step = pipeline_rf.named_steps["preprocessor"].named_transformers_["num"]
rf_numeric_feature_names = rf_features_step.get_feature_names_out(numerical_features)
cat_encoder_rf = pipeline_rf.named_steps["preprocessor"].named_transformers_["cat"]
cat_feature_names_rf = cat_encoder_rf.get_feature_names_out(categorical_features)
all_feature_names_rf = list(rf_numeric_feature_names) + list(cat_feature_names_rf)
rf_model = pipeline_rf.named_steps["rfr"]
# Calculate Split Counts (Frequency)
n_features = len(all_feature_names_rf)
split_counts = np.zeros(n_features, dtype=int)
for tree in rf_model.estimators_:
tree_features = tree.tree_.feature
# Filter out leaf nodes (indicated by -2)
indices = tree_features[tree_features >= 0]
np.add.at(split_counts, indices, 1)
# Create DataFrame with Importance and Splits
importance_df = pd.DataFrame(
{
"Feature": all_feature_names_rf,
"Importance": rf_model.feature_importances_,
"Splits": split_counts,
}
).sort_values(by="Importance", ascending=False)
# Filter for visualization
useful_features = importance_df[importance_df["Importance"] > 0].copy()
# Create custom labels: "Feature (Importance | Splits)"
useful_features["Label"] = useful_features.apply(
lambda x: f"{x['Importance']:.3f} ({int(x['Splits'])})", axis=1
)
display(useful_features)
# Visualization
fig, ax = plt.subplots(figsize=(14, len(useful_features) * 0.4 + 2))
# Create gradient colors based on Importance
norm = plt.Normalize(useful_features["Importance"].min(), useful_features["Importance"].max())
colors = plt.cm.viridis(norm(useful_features["Importance"]))
bars = ax.barh(
range(len(useful_features)),
useful_features["Importance"],
color=colors,
edgecolor="none",
height=0.7
)
# Add text labels inside or next to bars
for i, (bar, label) in enumerate(zip(bars, useful_features["Label"])):
width = bar.get_width()
ax.text(
width,
bar.get_y() + bar.get_height() / 2,
f" {label}",
va="center",
ha="left",
fontsize=10,
color="#333333",
fontweight="medium"
)
ax.set_yticks(range(len(useful_features)))
ax.set_yticklabels(useful_features["Feature"], fontsize=11)
ax.set_xlabel("Importance (Gain)", fontsize=12)
ax.set_title("Feature Importances (Gain) and Split Counts", fontsize=15, pad=15)
ax.invert_yaxis()
# Remove top and right spines for cleaner look
sns.despine()
plt.tight_layout()
plt.show()
| Feature | Importance | Splits | Label | |
|---|---|---|---|---|
| 7 | Last Played | 0.340450 | 175 | 0.340 (175) |
| 3 | Chart Constant | 0.294145 | 209 | 0.294 (209) |
| 6 | Play Count | 0.062840 | 63 | 0.063 (63) |
| 13 | noteCounts.slide | 0.046182 | 84 | 0.046 (84) |
| 9 | Last Played_Month | 0.033795 | 23 | 0.034 (23) |
| 12 | noteCounts.hold | 0.031515 | 53 | 0.032 (53) |
| 16 | noteCounts.total | 0.028551 | 35 | 0.029 (35) |
| 0 | Version | 0.025488 | 31 | 0.025 (31) |
| 15 | noteCounts.break | 0.020023 | 52 | 0.020 (52) |
| 4 | bpm | 0.019168 | 59 | 0.019 (59) |
| 5 | releaseDate | 0.018831 | 53 | 0.019 (53) |
| 11 | noteCounts.tap | 0.015146 | 26 | 0.015 (26) |
| 19 | songId_Japanese | 0.013148 | 38 | 0.013 (38) |
| 2 | difficulty | 0.012306 | 15 | 0.012 (15) |
| 14 | noteCounts.touch | 0.009308 | 33 | 0.009 (33) |
| 18 | songId_English | 0.006773 | 12 | 0.007 (12) |
| 20 | songId_Punctuation | 0.006287 | 10 | 0.006 (10) |
| 10 | Last Played_Day | 0.005070 | 21 | 0.005 (21) |
| 23 | Genre_POPS&ANIME | 0.004546 | 16 | 0.005 (16) |
| 17 | songId_length | 0.003153 | 10 | 0.003 (10) |
| 21 | Genre_niconico&VOCALOID™ | 0.002195 | 8 | 0.002 (8) |
| 25 | Genre_GAME&VARIETY | 0.000928 | 2 | 0.001 (2) |
| 22 | Genre_maimai | 0.000154 | 1 | 0.000 (1) |
C:\Users\123er\AppData\Local\Temp\ipykernel_7824\3721033785.py:77: UserWarning: Glyph 65286 (\N{FULLWIDTH AMPERSAND}) missing from font(s) Arial.
plt.tight_layout()
d:\Personal Data\Repositories\personal-repo\maimai-dx-regression\.venv\Lib\site-packages\IPython\core\pylabtools.py:170: UserWarning: Glyph 65286 (\N{FULLWIDTH AMPERSAND}) missing from font(s) Arial.
fig.canvas.print_figure(bytes_io, **kw)
In [94]:
# Show the test set predictions and actual scores
test_results_df = pd.DataFrame(
{
"Actual Achv": Achv_inverse_transform(y_test),
"Predicted Achv (ElasticNet)": Achv_inverse_transform(y_test_pred),
"Predicted Achv (RandomForest)": Achv_inverse_transform(y_test_pred_rf),
}
)
display(test_results_df)
| Actual Achv | Predicted Achv (ElasticNet) | Predicted Achv (RandomForest) | |
|---|---|---|---|
| 0 | 98.36 | 98.032286 | 96.445536 |
| 1 | 100.30 | 98.757396 | 99.157519 |
| 2 | 98.59 | 98.408814 | 98.791806 |
| 3 | 98.19 | 98.552006 | 98.951504 |
| 4 | 100.57 | 99.991425 | 99.726866 |
| ... | ... | ... | ... |
| 175 | 100.71 | 99.864643 | 99.722681 |
| 176 | 94.54 | 96.308191 | 96.253539 |
| 177 | 96.17 | 97.680423 | 97.167720 |
| 178 | 95.30 | 96.507082 | 96.282976 |
| 179 | 94.37 | 96.097244 | 96.287665 |
180 rows × 3 columns