Fetch and parse ACS benchmark results under a given directory

Each ACS benchmark run outputs a JSON file. This notebook collects all such files under a given root directory, parses them, and aggregates them into a more easily digestible pandas DataFrame.
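
The expected on-disk layout is roughly the following (directory names are placeholders; only the results.bench-<hash>.json files are parsed, and each one is keyed by the name of its parent directory):

RESULTS_ROOT_DIR/
├── <benchmark-run-dir>/
│   ├── results.bench-<hash>.json
│   └── ...          (other per-run artifacts, e.g. the predictions CSV)
└── <another-benchmark-run-dir>/
    └── results.bench-<hash>.json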

[1]:
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

[Action required] Set RESULTS_ROOT_DIR to the root results directory path:

[2]:
RESULTS_ROOT_DIR = Path("/fast/groups/sf") / "folktexts-results" / "2024-08-28_2"

Set the local path to the root data directory (needed only to train baseline ML methods):

[3]:
DATA_DIR = Path("/fast/groups/sf") / "data"

Important results columns:

[4]:
model_col = "config_model_name"
task_col = "config_task_name"
numeric_prompt_col = "config_numeric_risk_prompting"

feature_subset_col = "config_feature_subset"
predictions_path_col = "predictions_path"

Helper function to parse each dictionary containing benchmark results:

[5]:
from utils import (
    num_features_helper,
    parse_model_name,
    get_non_instruction_tuned_name,
    prettify_model_name,
)

def parse_results_dict(dct) -> dict:
    """Parses results dict and brings all information to the top-level."""

    # Make a copy so we don't modify the input object
    dct = dct.copy()

    # Discard plots' paths
    dct.pop("plots", None)

    # Bring configs to top-level
    config = dct.pop("config", {})
    for key, val in config.items():
        dct[f"config_{key}"] = val

    # Parse model name
    dct[model_col] = parse_model_name(dct[model_col])
    dct["base_name"] = get_non_instruction_tuned_name(dct[model_col])
    dct["name"] = prettify_model_name(dct[model_col])

    # Is instruction-tuned model?
    dct["is_inst"] = dct["base_name"] != dct[model_col] or "(it)" in dct["name"].lower()

    # Log number of features
    dct["num_features"] = num_features_helper(dct[feature_subset_col], max_features_return=-1)
    dct["uses_all_features"] = (dct[feature_subset_col] is None) or (dct["num_features"] == -1)

    if dct[feature_subset_col] is None:
        dct[feature_subset_col] = "full"

    # Assert all results are at the top-level
    assert not any(isinstance(val, dict) for val in dct.values())
    return dct
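
As a hypothetical example of the flattening this performs (keys and values below are illustrative, not an exhaustive list of benchmark outputs):

raw = {
    "accuracy": 0.74,
    "plots": {"roc_curve": "roc.png"},   # discarded
    "config": {                          # flattened into config_* keys
        "model_name": "google/gemma-2-27b",
        "task_name": "ACSTravelTime",
        "numeric_risk_prompting": True,
        "feature_subset": None,
    },
}
parsed = parse_results_dict(raw)
# `parsed` is a flat dict with keys such as "accuracy", "config_model_name",
# "config_task_name", "base_name", "name", "is_inst", "num_features", and
# "uses_all_features" -- with no nested dicts left.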

Recursively search the root directory for results files matching the file name pattern below:

[6]:
from utils import find_files, load_json

# Results file name pattern
pattern = r'^results.bench-(?P<hash>\d+)[.]json$'

# Find results files and aggregate
results = {}
for file_path in tqdm(find_files(RESULTS_ROOT_DIR, pattern)):
    results[Path(file_path).parent.name] = parse_results_dict(load_json(file_path))

if len(results) == 0:
    raise RuntimeError(f"Couldn't find any results at {RESULTS_ROOT_DIR}")
else:
    print(f"Found {len(results)} benchmark results.")
Found 241 benchmark results.
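
As a quick illustration of the file-name pattern (the hash below is made up):

import re
assert re.match(pattern, "results.bench-1234567890.json") is not None
assert re.match(pattern, "results.bench-1234567890.json.bak") is None  # rejected by the trailing $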

Aggregate results into a single DataFrame, generate a unique identifier for each row, and drop potential duplicates:

[7]:
df = pd.DataFrame(list(results.values()))

def row_id(row) -> str:
    """Unique row identifier."""
    numeric_or_multiple_choice = "Num" if row[numeric_prompt_col] else "QA"
    return f"{row[model_col]}__{row[task_col]}__{row['num_features']}__{numeric_or_multiple_choice}"

print(f"{df.shape=}")
df["id"] = df.apply(row_id, axis=1)

# Drop duplicates
len_with_dups = len(df)
df = df.drop_duplicates(subset=["name", "is_inst", "num_features", task_col, numeric_prompt_col])
df = df.set_index("id", drop=True, verify_integrity=True)

if len_with_dups != len(df):
    print(f"Dropping {len_with_dups - len(df)} duplicates!")
    print(f"{df.shape=}")
df.shape=(241, 58)
Dropping 31 duplicates!
df.shape=(210, 58)
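
For example, a Gemma 2 27B run on ACSTravelTime with the full feature set and numeric risk prompting gets the id gemma-2-27b__ACSTravelTime__-1__Num (the -1 encodes "all features"); this id shows up again in the sample output further below.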

Load scores DFs and analyze score distribution:

[8]:
def load_model_scores_df(df_row: pd.Series) -> pd.DataFrame | None:
    """Loads the CSV of model scores corresponding to the given DF row, if it exists."""
    if predictions_path_col in df_row and not pd.isna(df_row[predictions_path_col]):
        return pd.read_csv(df_row[predictions_path_col], index_col=0)
    return None
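
Each predictions CSV is assumed to contain at least the two columns used by the analysis below, indexed by example row:

# Expected columns of each predictions CSV (only these are used below):
#   risk_score : float in [0, 1] -- model-predicted probability of the positive label
#   label      : {0, 1}          -- ground-truth binary outcome
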
[9]:
import logging

from folktexts.evaluation import compute_best_threshold
from sklearn import metrics
from scipy import stats

# Number of samples used to fit the one-parameter binarization threshold!
N = 100

fit_thresh_col = f"fit_thresh_on_{N}"
fit_acc_col = "fit_thresh_accuracy"

optimal_thres_col = "optimal_thresh"
optimal_acc_col = "optimal_thresh_accuracy"

score_stdev_col = "score_stdev"
score_mean_col = "score_mean"

scores_stats = dict()
for row_id, row in tqdm(df.iterrows(), total=len(df)):

    # Load model scores; skip rows whose predictions file is missing
    scores_df = load_model_scores_df(row)
    if scores_df is None:
        logging.error(f"Couldn't find scores for {row_id}")
        continue

    # Extract scores and labels
    risk_scores = scores_df["risk_score"].to_numpy()
    labels = scores_df["label"].to_numpy()

    # Sample N rows to fit threshold
    scores_df_sample = scores_df.sample(n=N, random_state=42)

    # Compute optimal threshold on each data sample
    fit_thr = compute_best_threshold(y_true=scores_df_sample["label"], y_pred_scores=scores_df_sample["risk_score"])
    opt_thr = compute_best_threshold(y_true=labels, y_pred_scores=risk_scores)

    # Evaluate accuracy
    fit_acc = metrics.accuracy_score(labels, (risk_scores >= fit_thr).astype(int))
    opt_acc = metrics.accuracy_score(labels, (risk_scores >= opt_thr).astype(int))

    # Save results
    scores_stats[row_id] = {
        fit_thresh_col: fit_thr,
        fit_acc_col: fit_acc,
        optimal_thres_col: opt_thr,
        optimal_acc_col: opt_acc,
        score_stdev_col: np.std(risk_scores),
        score_mean_col: np.mean(risk_scores),
    }
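
compute_best_threshold is used above to pick a single binarization threshold from the model's risk scores. As a rough sketch of the idea (an assumption about its behavior, not folktexts' actual implementation), an accuracy-maximizing threshold search could look like:

import numpy as np

def best_accuracy_threshold(y_true, y_scores) -> float:
    """Brute-force threshold maximizing accuracy -- illustrative sketch only."""
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)
    candidates = np.unique(y_scores)
    accuracies = [np.mean((y_scores >= t).astype(int) == y_true) for t in candidates]
    return float(candidates[int(np.argmax(accuracies))])

Comparing the threshold fit on only N=100 samples (fit_thresh_on_100) against the one fit on the full test set (optimal_thresh) indicates how much accuracy is lost when the threshold must be chosen from a small labeled sample.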

Update results DF with scores statistics:

[10]:
scores_stats_df = pd.DataFrame.from_dict(scores_stats, orient="index")
print(f"{scores_stats_df.shape=}")

results_df = pd.concat((df, scores_stats_df), axis="columns")
results_df.sample(2)
scores_stats_df.shape=(210, 6)
[10]:
accuracy accuracy_diff accuracy_ratio balanced_accuracy balanced_accuracy_diff balanced_accuracy_ratio brier_score_loss ece ece_quantile equalized_odds_diff ... name is_inst num_features uses_all_features fit_thresh_on_100 fit_thresh_accuracy optimal_thresh optimal_thresh_accuracy score_stdev score_mean
gemma-2-27b__ACSTravelTime__-1__Num 0.539392 0.258000 0.656000 0.504500 0.100184 0.823204 0.388077 0.347071 NaN 0.221264 ... Gemma 2 27B False -1 True 0.13000 0.514676 0.210000 0.524692 0.311014 0.219864
gemma-1.1-2b-it__ACSEmployment__-1__QA 0.462874 0.235384 0.631881 0.427372 0.091640 0.800718 0.408398 0.382143 0.376308 0.212855 ... Gemma 2B (it) True -1 True 0.02033 0.482054 0.007577 0.485395 0.232183 0.216562

2 rows × 64 columns
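
Since pd.concat aligns on the index, rows whose predictions file was missing (and were therefore skipped above) end up with NaN score statistics; a quick sanity check one might add:

# Rows without score statistics indicate a missing or unreadable predictions CSV
missing_stats = results_df[score_mean_col].isna().sum()
if missing_stats:
    print(f"Warning: {missing_stats} rows have no score statistics.")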

Check if any results are missing:

[11]:
experiments_per_model_task_pair = results_df.groupby([model_col, task_col]).nunique().max(axis=None)

for m in results_df[model_col].unique():
    for t in results_df[task_col].unique():
        match_ = results_df[(results_df[model_col] == m) & (results_df[task_col] == t)]
        if len(match_) < experiments_per_model_task_pair:
            print(f"Couldn't find all results for m={m}, t={t} ({len(match_)} < {experiments_per_model_task_pair})")

Finally, save the aggregated results DF to the results root directory:

[12]:
from utils import get_current_timestamp
results_df.to_csv(Path(RESULTS_ROOT_DIR) / f"aggregated_results.{get_current_timestamp()}.csv")
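
The timestamped CSV can later be re-loaded with the row id as its index, for example:

# Example: re-load the most recent aggregated results file
# (assumes the timestamps produced by get_current_timestamp sort lexicographically)
agg_files = sorted(Path(RESULTS_ROOT_DIR).glob("aggregated_results.*.csv"))
if agg_files:
    results_df = pd.read_csv(agg_files[-1], index_col=0)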