Fetch and parse ACS benchmark results under a given directory
Each ACS benchmark run outputs a JSON file with its results. This script collects all such files under a given root directory, parses them, and aggregates them into a more easily digestible pandas DataFrame.
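For reference, each results file is a nested JSON object: top-level metrics, a "config" sub-dict, and paths to auxiliary artifacts. The example below is illustrative only (made-up values, written as a Python dict), inferred from how the files are parsed further down; real files contain many more metric keys.
# Illustrative (made-up) example of a loaded results file; not an actual output.
example_results = {
    "accuracy": 0.54,
    "brier_score_loss": 0.39,
    "predictions_path": "/path/to/run/predictions.csv",
    "plots": {"calibration_curve": "/path/to/run/calibration_curve.png"},
    "config": {
        "model_name": "gemma-2-27b",
        "task_name": "ACSTravelTime",
        "numeric_risk_prompting": True,
        "feature_subset": None,
    },
}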
[1]:
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
[Action required] Set RESULTS_ROOT_DIR to the root results directory path:
[2]:
RESULTS_ROOT_DIR = Path("/fast/groups/sf") / "folktexts-results" / "2024-08-28_2"
Set the local path to the root data directory (needed only to train baseline ML methods):
[3]:
DATA_DIR = Path("/fast/groups/sf") / "data"
Important results columns:
[4]:
model_col = "config_model_name"
task_col = "config_task_name"
numeric_prompt_col = "config_numeric_risk_prompting"
feature_subset_col = "config_feature_subset"
predictions_path_col = "predictions_path"
Helper function to parse each dictionary containing benchmark results:
[5]:
from utils import (
    num_features_helper,
    parse_model_name,
    get_non_instruction_tuned_name,
    prettify_model_name,
)


def parse_results_dict(dct) -> dict:
    """Parses results dict and brings all information to the top-level."""
    # Make a copy so we don't modify the input object
    dct = dct.copy()

    # Discard plots' paths
    dct.pop("plots", None)

    # Bring configs to top-level
    config = dct.pop("config", {})
    for key, val in config.items():
        dct[f"config_{key}"] = val

    # Parse model name
    dct[model_col] = parse_model_name(dct[model_col])
    dct["base_name"] = get_non_instruction_tuned_name(dct[model_col])
    dct["name"] = prettify_model_name(dct[model_col])

    # Is instruction-tuned model?
    dct["is_inst"] = dct["base_name"] != dct[model_col] or "(it)" in dct["name"].lower()

    # Log number of features
    dct["num_features"] = num_features_helper(dct[feature_subset_col], max_features_return=-1)
    dct["uses_all_features"] = (dct[feature_subset_col] is None) or (dct["num_features"] == -1)
    if dct[feature_subset_col] is None:
        dct[feature_subset_col] = "full"

    # Assert all results are at the top-level
    assert not any(isinstance(val, dict) for val in dct.values())
    return dct
Recursively search the root directory for results files whose names match the given regex:
[6]:
from utils import find_files, load_json

# Results file name pattern
pattern = r'^results.bench-(?P<hash>\d+)[.]json$'

# Find results files and aggregate
results = {}
for file_path in tqdm(find_files(RESULTS_ROOT_DIR, pattern)):
    results[Path(file_path).parent.name] = parse_results_dict(load_json(file_path))

if len(results) == 0:
    raise RuntimeError(f"Couldn't find any results at {RESULTS_ROOT_DIR}")
else:
    print(f"Found {len(results)} benchmark results.")
Found 241 benchmark results.
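find_files and load_json come from the local utils module, which is not shown here. A minimal sketch of what they might look like, assuming find_files walks the directory tree and returns the paths whose file names match the regex:
# Hypothetical sketch of the local `utils` helpers used above; the real
# implementations in `utils.py` may differ.
import json
import os
import re


def find_files(root_dir, pattern: str) -> list:
    """Recursively collect file paths under `root_dir` whose names match `pattern`."""
    regex = re.compile(pattern)
    return [
        os.path.join(dir_path, file_name)
        for dir_path, _dir_names, file_names in os.walk(root_dir)
        for file_name in file_names
        if regex.match(file_name)
    ]


def load_json(path) -> dict:
    """Load a JSON file into a dict."""
    with open(path, "r") as f_in:
        return json.load(f_in)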
Aggregate results into a single DataFrame, generate a unique identifier for each row, and drop potential duplicates:
[7]:
df = pd.DataFrame(list(results.values()))


def row_id(row: pd.Series) -> str:
    """Unique row identifier."""
    numeric_or_multiple_choice = "Num" if row[numeric_prompt_col] else "QA"
    return f"{row[model_col]}__{row[task_col]}__{row['num_features']}__{numeric_or_multiple_choice}"


print(f"{df.shape=}")
df["id"] = df.apply(row_id, axis=1)

# Drop duplicates
len_with_dups = len(df)
df = df.drop_duplicates(subset=["name", "is_inst", "num_features", task_col, numeric_prompt_col])
df = df.set_index("id", drop=True, verify_integrity=True)
if len_with_dups != len(df):
    print(f"Dropping {len_with_dups - len(df)} duplicates!")

print(f"{df.shape=}")
df.shape=(241, 58)
Dropping 31 duplicates!
df.shape=(210, 58)
Load scores DFs and analyze score distribution:
[8]:
def load_model_scores_df(df_row: pd.Series) -> pd.DataFrame:
    """Loads the CSV containing model scores corresponding to the given DF row."""
    if predictions_path_col in df_row and not pd.isna(df_row[predictions_path_col]):
        return pd.read_csv(df_row[predictions_path_col], index_col=0)
    return None
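The next cell relies only on the predictions CSV exposing a `risk_score` and a `label` column. A toy example of that layout, with made-up values:
# Toy example of the expected scores DataFrame layout (values are made up);
# only the `risk_score` and `label` columns are used below.
toy_scores_df = pd.DataFrame({
    "risk_score": [0.12, 0.87, 0.45],  # model risk scores in [0, 1]
    "label": [0, 1, 1],                # binary ground-truth labels
})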
[9]:
import logging

from folktexts.evaluation import compute_best_threshold
from sklearn import metrics
from scipy import stats
# Number of samples used to fit the one-parameter binarization threshold!
N = 100
fit_thresh_col = f"fit_thresh_on_{N}"
fit_acc_col = "fit_thresh_accuracy"
optimal_thres_col = "optimal_thresh"
optimal_acc_col = "optimal_thresh_accuracy"
score_stdev_col = "score_stdev"
score_mean_col = "score_mean"
scores_stats = dict()
for row_id, row in tqdm(df.iterrows(), total=len(df)):

    # Load model scores; skip rows whose predictions file is missing
    scores_df = load_model_scores_df(row)
    if scores_df is None:
        logging.error(f"Couldn't find scores for {row_id}")
        continue

    # Extract scores and labels
    risk_scores = scores_df["risk_score"].to_numpy()
    labels = scores_df["label"].to_numpy()

    # Sample N rows to fit the threshold on
    scores_df_sample = scores_df.sample(n=N, random_state=42)

    # Compute the best threshold on the N-row sample and on the full data
    fit_thr = compute_best_threshold(y_true=scores_df_sample["label"], y_pred_scores=scores_df_sample["risk_score"])
    opt_thr = compute_best_threshold(y_true=labels, y_pred_scores=risk_scores)

    # Evaluate accuracy of the binarized predictions under each threshold
    fit_acc = metrics.accuracy_score(labels, (risk_scores >= fit_thr).astype(int))
    opt_acc = metrics.accuracy_score(labels, (risk_scores >= opt_thr).astype(int))

    # Save results
    scores_stats[row_id] = {
        fit_thresh_col: fit_thr,
        fit_acc_col: fit_acc,
        optimal_thres_col: opt_thr,
        optimal_acc_col: opt_acc,
        score_stdev_col: np.std(risk_scores),
        score_mean_col: np.mean(risk_scores),
    }
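compute_best_threshold is folktexts' helper for picking a single binarization threshold. For intuition, a brute-force stand-in that returns the accuracy-maximizing cutoff might look like the sketch below; this is an assumption about its behavior, not the library's actual implementation.
# Illustrative stand-in for `compute_best_threshold` (assumption: it returns the
# score cutoff that maximizes accuracy on the given data).
def accuracy_maximizing_threshold(y_true, y_pred_scores) -> float:
    candidate_thresholds = np.unique(y_pred_scores)
    accuracies = [
        metrics.accuracy_score(y_true, (y_pred_scores >= thr).astype(int))
        for thr in candidate_thresholds
    ]
    return float(candidate_thresholds[int(np.argmax(accuracies))])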
Update results DF with scores statistics:
[10]:
scores_stats_df = pd.DataFrame(scores_stats.values(), index=list(scores_stats.keys()))
print(f"{scores_stats_df.shape=}")
results_df = pd.concat((df, scores_stats_df), axis="columns")
results_df.sample(2)
scores_stats_df.shape=(210, 6)
[10]:
 | accuracy | accuracy_diff | accuracy_ratio | balanced_accuracy | balanced_accuracy_diff | balanced_accuracy_ratio | brier_score_loss | ece | ece_quantile | equalized_odds_diff | ... | name | is_inst | num_features | uses_all_features | fit_thresh_on_100 | fit_thresh_accuracy | optimal_thresh | optimal_thresh_accuracy | score_stdev | score_mean |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
gemma-2-27b__ACSTravelTime__-1__Num | 0.539392 | 0.258000 | 0.656000 | 0.504500 | 0.100184 | 0.823204 | 0.388077 | 0.347071 | NaN | 0.221264 | ... | Gemma 2 27B | False | -1 | True | 0.13000 | 0.514676 | 0.210000 | 0.524692 | 0.311014 | 0.219864 |
gemma-1.1-2b-it__ACSEmployment__-1__QA | 0.462874 | 0.235384 | 0.631881 | 0.427372 | 0.091640 | 0.800718 | 0.408398 | 0.382143 | 0.376308 | 0.212855 | ... | Gemma 2B (it) | True | -1 | True | 0.02033 | 0.482054 | 0.007577 | 0.485395 | 0.232183 | 0.216562 |
2 rows × 64 columns
Check if any results are missing:
[11]:
experiments_per_model_task_pair = results_df.groupby([model_col, task_col]).nunique().max(axis=None)

for m in results_df[model_col].unique():
    for t in results_df[task_col].unique():
        match_ = results_df[(results_df[model_col] == m) & (results_df[task_col] == t)]
        if len(match_) < experiments_per_model_task_pair:
            print(f"Couldn't find all results for m={m}, t={t} ({len(match_)} < {experiments_per_model_task_pair})")
Finally, save results DF to the results root directory:
[12]:
from utils import get_current_timestamp
results_df.to_csv(Path(RESULTS_ROOT_DIR) / f"aggregated_results.{get_current_timestamp()}.csv")
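get_current_timestamp is another small helper from the local utils module. A plausible stand-in, assuming it returns a filesystem-safe timestamp string (the real helper may use a different format):
# Hypothetical stand-in for `utils.get_current_timestamp`.
from datetime import datetime


def get_current_timestamp() -> str:
    return datetime.now().strftime("%Y-%m-%d_%H-%M-%S")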