{ "cells": [ { "cell_type": "markdown", "id": "04c2f12d-f989-4ac9-b90f-3464a8bcca96", "metadata": {}, "source": [ "# Fetch and parse ACS benchmark results under a given directory\n", "Each ACS benchmark run outputs a json file. This script collects all such files under a given root directory, parses them, and aggregates them into a more easily digestable pandas DataFrame." ] }, { "cell_type": "code", "execution_count": 1, "id": "3b241208-d10f-43cf-a486-84c54bbf43c3", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm.auto import tqdm" ] }, { "cell_type": "markdown", "id": "faf6afa4-8648-4d35-9312-65636ea5d0b2", "metadata": {}, "source": [ "**[Action required]** Set `RESULTS_ROOT_DIR` to the root results directory path:" ] }, { "cell_type": "code", "execution_count": 2, "id": "26089a60-81c0-4736-8ba5-99572ec01398", "metadata": {}, "outputs": [], "source": [ "RESULTS_ROOT_DIR = Path(\"/fast/groups/sf\") / \"folktexts-results\" / \"2024-08-28_2\"" ] }, { "cell_type": "markdown", "id": "03d1af7f-d013-40a0-80de-cb9ad65cff6d", "metadata": {}, "source": [ "Set the local path to the root data directory (needed only to train baseline ML methods):" ] }, { "cell_type": "code", "execution_count": 3, "id": "b18dad87-a1ed-495b-93b6-4c8af4043ddc", "metadata": {}, "outputs": [], "source": [ "DATA_DIR = Path(\"/fast/groups/sf\") / \"data\"" ] }, { "cell_type": "markdown", "id": "db7c98c5-2942-4f88-984a-8c9014afe761", "metadata": {}, "source": [ "Important results columns:" ] }, { "cell_type": "code", "execution_count": 4, "id": "e96f0c4f-5683-4150-8cca-93a883f20154", "metadata": {}, "outputs": [], "source": [ "model_col = \"config_model_name\"\n", "task_col = \"config_task_name\"\n", "numeric_prompt_col = \"config_numeric_risk_prompting\"\n", "\n", "feature_subset_col = \"config_feature_subset\"\n", "predictions_path_col = \"predictions_path\"" ] }, { "cell_type": "markdown", "id": "4296ffdf-a9af-43f7-bd0c-20f4ae5f9619", "metadata": {}, "source": [ "Helper function to parse each dictionary containing benchmark results:" ] }, { "cell_type": "code", "execution_count": 5, "id": "0cf4a6c8-ec28-4e66-9f6d-8a4e977d7b60", "metadata": {}, "outputs": [], "source": [ "from utils import (\n", " num_features_helper,\n", " parse_model_name,\n", " get_non_instruction_tuned_name,\n", " prettify_model_name,\n", ")\n", "\n", "def parse_results_dict(dct) -> dict:\n", " \"\"\"Parses results dict and brings all information to the top-level.\"\"\"\n", "\n", " # Make a copy so we don't modify the input object\n", " dct = dct.copy()\n", "\n", " # Discard plots' paths\n", " dct.pop(\"plots\", None)\n", "\n", " # Bring configs to top-level\n", " config = dct.pop(\"config\", {})\n", " for key, val in config.items():\n", " dct[f\"config_{key}\"] = val\n", "\n", " # Parse model name\n", " dct[model_col] = parse_model_name(dct[model_col])\n", " dct[\"base_name\"] = get_non_instruction_tuned_name(dct[model_col])\n", " dct[\"name\"] = prettify_model_name(dct[model_col])\n", "\n", " # Is instruction-tuned model?\n", " dct[\"is_inst\"] = dct[\"base_name\"] != dct[model_col] or \"(it)\" in dct[\"name\"].lower()\n", "\n", " # Log number of features\n", " dct[\"num_features\"] = num_features_helper(dct[feature_subset_col], max_features_return=-1)\n", " dct[\"uses_all_features\"] = (dct[feature_subset_col] is None) or (dct[\"num_features\"] == -1)\n", "\n", " if dct[feature_subset_col] is None:\n", " dct[feature_subset_col] = \"full\"\n", "\n", " # Assert all 
{ "cell_type": "markdown", "id": "02009a86-e099-4ec0-8676-8d66190ceddb", "metadata": {}, "source": [ "Iteratively search the root directory for results files matching the given regex:" ] }, { "cell_type": "code", "execution_count": 6, "id": "aaefa99d-dbd7-40a3-a3c1-7571d3811409", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b984f67ca7644ba998983af5385ba147", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Found 241 benchmark results.\n" ] } ], "source": [ "from utils import find_files, load_json\n", "\n", "# Results file name pattern\n", "pattern = r'^results.bench-(?P<hash>\d+)[.]json$'\n", "\n", "# Find results files and aggregate\n", "results = {}\n", "for file_path in tqdm(find_files(RESULTS_ROOT_DIR, pattern)):\n", " results[Path(file_path).parent.name] = parse_results_dict(load_json(file_path))\n", "\n", "if len(results) == 0:\n", " raise RuntimeError(f\"Couldn't find any results at {RESULTS_ROOT_DIR}\")\n", "else:\n", " print(f\"Found {len(results)} benchmark results.\")" ] }, { "cell_type": "markdown", "id": "0ecc0d33-902d-492a-89c6-d9729fe69fa1", "metadata": {}, "source": [ "Aggregate results into a single DataFrame, generate a unique identifier for each row, and drop potential duplicates:" ] }, { "cell_type": "code", "execution_count": 7, "id": "94f1900f-fea3-4872-b722-ee1cd3f5f7e1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "df.shape=(241, 58)\n", "Dropping 31 duplicates!\n", "df.shape=(210, 58)\n" ] } ], "source": [ "df = pd.DataFrame(list(results.values()))\n", "\n", "def row_id(row) -> str:\n", " \"\"\"Unique row identifier.\"\"\"\n", " numeric_or_multiple_choice = \"Num\" if row[numeric_prompt_col] else \"QA\"\n", " return f\"{row[model_col]}__{row[task_col]}__{row['num_features']}__{numeric_or_multiple_choice}\"\n", "\n", "print(f\"{df.shape=}\")\n", "df[\"id\"] = df.apply(row_id, axis=1)\n", "\n", "# Drop duplicates\n", "len_with_dups = len(df)\n", "df = df.drop_duplicates(subset=[\"name\", \"is_inst\", \"num_features\", task_col, numeric_prompt_col])\n", "df = df.set_index(\"id\", drop=True, verify_integrity=True)\n", "\n", "if len_with_dups != len(df):\n", " print(f\"Dropping {len_with_dups - len(df)} duplicates!\")\n", " print(f\"{df.shape=}\")" ] }, { "cell_type": "markdown", "id": "97cd24df-3c1f-49c0-bb50-a6fdba22e0fb", "metadata": {}, "source": [ "Load scores DFs and analyze score distribution:" ] }, { "cell_type": "code", "execution_count": 8, "id": "43ba0fbf-b886-4f2a-8a01-564aa6f0f2c6", "metadata": {}, "outputs": [], "source": [ "def load_model_scores_df(df_row: pd.Series) -> pd.DataFrame:\n", " \"\"\"Loads the CSV containing model scores corresponding to the given DF row.\"\"\"\n", " if predictions_path_col in df_row and not pd.isna(df_row[predictions_path_col]):\n", " return pd.read_csv(df_row[predictions_path_col], index_col=0)\n", " return None" ] },
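{ "cell_type": "markdown", "id": "f0e1d2c3-0000-4000-8000-0000000000a3", "metadata": {}, "source": [ "For each run we fit a decision threshold on the model's risk scores and evaluate its accuracy. The helper below is a minimal sketch of accuracy-maximizing threshold selection; the helper name `fit_best_threshold`, and the score/label column names used in the next cell, are assumptions (the exact fitting procedure behind the `fit_thresh_on_100` column is not recorded here). A quick check on synthetic scores follows the definition:" ] }, { "cell_type": "code", "execution_count": null, "id": "f0e1d2c3-0000-4000-8000-0000000000a4", "metadata": {}, "outputs": [], "source": [ "from sklearn import metrics\n", "\n", "def fit_best_threshold(labels, risk_scores) -> float:\n", " \"\"\"Returns the score threshold that maximizes accuracy on the given data.\"\"\"\n", " thresholds = np.unique(risk_scores)\n", " accs = [metrics.accuracy_score(labels, (risk_scores >= thr).astype(int)) for thr in thresholds]\n", " return float(thresholds[int(np.argmax(accs))])\n", "\n", "# Synthetic check: scores separated around 0.5 should yield a threshold near 0.5\n", "rng = np.random.default_rng(42)\n", "y = rng.integers(0, 2, size=1_000)\n", "s = np.clip(0.25 + 0.5 * y + rng.normal(0, 0.1, size=1_000), 0, 1)\n", "assert 0.3 < fit_best_threshold(y, s) < 0.7" ] }, { "cell_type": "code", "execution_count": 9, "id": "54e22896-1d3d-4d95-bb90-2b453f58f09a", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "64f4e6718312499b84fb8c9637ff10fd", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/210 [00:00<?, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Columns for the fitted and optimal threshold statistics\n", "fit_thresh_col = \"fit_thresh_on_100\"\n", "fit_acc_col = \"fit_thresh_accuracy\"\n", "optimal_thres_col = \"optimal_thresh\"\n", "optimal_acc_col = \"optimal_thresh_accuracy\"\n", "score_stdev_col = \"score_stdev\"\n", "score_mean_col = \"score_mean\"\n", "\n", "scores_stats = {}\n", "for row_id, row in tqdm(df.iterrows(), total=len(df)):\n", " scores_df = load_model_scores_df(row)\n", " if scores_df is None:\n", " continue\n", "\n", " # NOTE: score/label column names are assumed from the predictions CSV format\n", " risk_scores = scores_df[\"risk_score\"].to_numpy()\n", " labels = scores_df[\"label\"].to_numpy()\n", "\n", " # Fit a threshold on the first 100 samples, and the optimal threshold on all data\n", " fit_thr = fit_best_threshold(labels[:100], risk_scores[:100])\n", " opt_thr = fit_best_threshold(labels, risk_scores)\n", "\n", " fit_acc = metrics.accuracy_score(labels, (risk_scores >= fit_thr).astype(int))\n", " opt_acc = 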
metrics.accuracy_score(labels, (risk_scores >= opt_thr).astype(int))\n", "\n", " # Save results\n", " scores_stats[row_id] = {\n", " fit_thresh_col: fit_thr,\n", " fit_acc_col: fit_acc,\n", " optimal_thres_col: opt_thr,\n", " optimal_acc_col: opt_acc,\n", " score_stdev_col: np.std(risk_scores),\n", " score_mean_col: np.mean(risk_scores),\n", " }" ] }, { "cell_type": "markdown", "id": "b1a58b87-2de8-473a-badb-934951d1bcdc", "metadata": {}, "source": [ "Update results DF with scores statistics:" ] }, { "cell_type": "code", "execution_count": 10, "id": "0ba74c81-c008-4010-a9a3-1b511ae445df", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "scores_stats_df.shape=(210, 6)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyaccuracy_diffaccuracy_ratiobalanced_accuracybalanced_accuracy_diffbalanced_accuracy_ratiobrier_score_losseceece_quantileequalized_odds_diff...nameis_instnum_featuresuses_all_featuresfit_thresh_on_100fit_thresh_accuracyoptimal_threshoptimal_thresh_accuracyscore_stdevscore_mean
gemma-2-27b__ACSTravelTime__-1__Num0.5393920.2580000.6560000.5045000.1001840.8232040.3880770.347071NaN0.221264...Gemma 2 27BFalse-1True0.130000.5146760.2100000.5246920.3110140.219864
gemma-1.1-2b-it__ACSEmployment__-1__QA0.4628740.2353840.6318810.4273720.0916400.8007180.4083980.3821430.3763080.212855...Gemma 2B (it)True-1True0.020330.4820540.0075770.4853950.2321830.216562
\n", "

2 rows × 64 columns

\n", "
" ], "text/plain": [ " accuracy accuracy_diff \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.539392 0.258000 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.462874 0.235384 \n", "\n", " accuracy_ratio balanced_accuracy \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.656000 0.504500 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.631881 0.427372 \n", "\n", " balanced_accuracy_diff \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.100184 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.091640 \n", "\n", " balanced_accuracy_ratio \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.823204 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.800718 \n", "\n", " brier_score_loss ece \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.388077 0.347071 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.408398 0.382143 \n", "\n", " ece_quantile equalized_odds_diff \\\n", "gemma-2-27b__ACSTravelTime__-1__Num NaN 0.221264 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.376308 0.212855 \n", "\n", " ... name is_inst \\\n", "gemma-2-27b__ACSTravelTime__-1__Num ... Gemma 2 27B False \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA ... Gemma 2B (it) True \n", "\n", " num_features uses_all_features \\\n", "gemma-2-27b__ACSTravelTime__-1__Num -1 True \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA -1 True \n", "\n", " fit_thresh_on_100 \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.13000 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.02033 \n", "\n", " fit_thresh_accuracy optimal_thresh \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.514676 0.210000 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.482054 0.007577 \n", "\n", " optimal_thresh_accuracy score_stdev \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.524692 0.311014 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.485395 0.232183 \n", "\n", " score_mean \n", "gemma-2-27b__ACSTravelTime__-1__Num 0.219864 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.216562 \n", "\n", "[2 rows x 64 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores_stats_df = pd.DataFrame(scores_stats.values(), index=list(scores_stats.keys()))\n", "print(f\"{scores_stats_df.shape=}\")\n", "\n", "results_df = pd.concat((df, scores_stats_df), axis=\"columns\")\n", "results_df.sample(2)" ] }, { "cell_type": "markdown", "id": "2d04ff64-39c4-43cc-9df2-fa4e099c4e56", "metadata": {}, "source": [ "Check if any results are missing:" ] }, { "cell_type": "code", "execution_count": 11, "id": "f1faf926-de0f-442e-baf7-c3029f5386e4", "metadata": {}, "outputs": [], "source": [ "experiments_per_model_task_pair = results_df.groupby([model_col, task_col]).nunique().max(axis=None)\n", "\n", "for m in results_df[model_col].unique():\n", " for t in results_df[task_col].unique():\n", " match_ = results_df[(results_df[model_col] == m) & (results_df[task_col] == t)]\n", " if len(match_) < experiments_per_model_task_pair:\n", " print(f\"Couldn't find all results for m={m}, t={t} ({len(match_)} < {experiments_per_model_task_pair})\")" ] }, { "cell_type": "markdown", "id": "a6e859fc-9d82-436d-a238-11fd008da44c", "metadata": {}, "source": [ "Finally, save results DF to the results root directory:" ] }, { "cell_type": "code", "execution_count": 12, "id": "9725afa2-b6f8-46b1-86e7-57ae82ef05d9", "metadata": {}, "outputs": [], "source": [ "from utils import get_current_timestamp\n", "results_df.to_csv(Path(RESULTS_ROOT_DIR) / f\"aggregated_results.{get_current_timestamp()}.csv\")" ] }, { "cell_type": "markdown", "id": "c3a6beb5-b44b-4f81-ae1a-d027afb2c5f4", "metadata": {}, "source": [ "---" ] } ], 
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }