{ "cells": [ { "cell_type": "markdown", "id": "04c2f12d-f989-4ac9-b90f-3464a8bcca96", "metadata": {}, "source": [ "# Fetch and parse ACS benchmark results under a given directory\n", "Each ACS benchmark run outputs a json file. This script collects all such files under a given root directory, parses them, and aggregates them into a more easily digestable pandas DataFrame." ] }, { "cell_type": "code", "execution_count": 1, "id": "3b241208-d10f-43cf-a486-84c54bbf43c3", "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm.auto import tqdm" ] }, { "cell_type": "markdown", "id": "faf6afa4-8648-4d35-9312-65636ea5d0b2", "metadata": {}, "source": [ "**[Action required]** Set `RESULTS_ROOT_DIR` to the root results directory path:" ] }, { "cell_type": "code", "execution_count": 2, "id": "26089a60-81c0-4736-8ba5-99572ec01398", "metadata": {}, "outputs": [], "source": [ "RESULTS_ROOT_DIR = Path(\"/fast/groups/sf\") / \"folktexts-results\" / \"2024-08-28_2\"" ] }, { "cell_type": "markdown", "id": "03d1af7f-d013-40a0-80de-cb9ad65cff6d", "metadata": {}, "source": [ "Set the local path to the root data directory (needed only to train baseline ML methods):" ] }, { "cell_type": "code", "execution_count": 3, "id": "b18dad87-a1ed-495b-93b6-4c8af4043ddc", "metadata": {}, "outputs": [], "source": [ "DATA_DIR = Path(\"/fast/groups/sf\") / \"data\"" ] }, { "cell_type": "markdown", "id": "db7c98c5-2942-4f88-984a-8c9014afe761", "metadata": {}, "source": [ "Important results columns:" ] }, { "cell_type": "code", "execution_count": 4, "id": "e96f0c4f-5683-4150-8cca-93a883f20154", "metadata": {}, "outputs": [], "source": [ "model_col = \"config_model_name\"\n", "task_col = \"config_task_name\"\n", "numeric_prompt_col = \"config_numeric_risk_prompting\"\n", "\n", "feature_subset_col = \"config_feature_subset\"\n", "predictions_path_col = \"predictions_path\"" ] }, { "cell_type": "markdown", "id": "4296ffdf-a9af-43f7-bd0c-20f4ae5f9619", "metadata": {}, "source": [ "Helper function to parse each dictionary containing benchmark results:" ] }, { "cell_type": "code", "execution_count": 5, "id": "0cf4a6c8-ec28-4e66-9f6d-8a4e977d7b60", "metadata": {}, "outputs": [], "source": [ "from utils import (\n", " num_features_helper,\n", " parse_model_name,\n", " get_non_instruction_tuned_name,\n", " prettify_model_name,\n", ")\n", "\n", "def parse_results_dict(dct) -> dict:\n", " \"\"\"Parses results dict and brings all information to the top-level.\"\"\"\n", "\n", " # Make a copy so we don't modify the input object\n", " dct = dct.copy()\n", "\n", " # Discard plots' paths\n", " dct.pop(\"plots\", None)\n", "\n", " # Bring configs to top-level\n", " config = dct.pop(\"config\", {})\n", " for key, val in config.items():\n", " dct[f\"config_{key}\"] = val\n", "\n", " # Parse model name\n", " dct[model_col] = parse_model_name(dct[model_col])\n", " dct[\"base_name\"] = get_non_instruction_tuned_name(dct[model_col])\n", " dct[\"name\"] = prettify_model_name(dct[model_col])\n", "\n", " # Is instruction-tuned model?\n", " dct[\"is_inst\"] = dct[\"base_name\"] != dct[model_col] or \"(it)\" in dct[\"name\"].lower()\n", "\n", " # Log number of features\n", " dct[\"num_features\"] = num_features_helper(dct[feature_subset_col], max_features_return=-1)\n", " dct[\"uses_all_features\"] = (dct[feature_subset_col] is None) or (dct[\"num_features\"] == -1)\n", "\n", " if dct[feature_subset_col] is None:\n", " dct[feature_subset_col] = \"full\"\n", "\n", " # Assert all 
{ "cell_type": "markdown", "id": "02009a86-e099-4ec0-8676-8d66190ceddb", "metadata": {}, "source": [ "Iteratively search the root directory for results files matching the given regex:" ] }, { "cell_type": "code", "execution_count": 6, "id": "aaefa99d-dbd7-40a3-a3c1-7571d3811409", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "b984f67ca7644ba998983af5385ba147", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Found 241 benchmark results.\n" ] } ], "source": [ "from utils import find_files, load_json\n", "\n", "# Results file name pattern\n", "pattern = r'^results.bench-(?P<hash>\d+)[.]json$'\n", "\n", "# Find results files and aggregate\n", "results = {}\n", "for file_path in tqdm(find_files(RESULTS_ROOT_DIR, pattern)):\n", " results[Path(file_path).parent.name] = parse_results_dict(load_json(file_path))\n", "\n", "if len(results) == 0:\n", " raise RuntimeError(f\"Couldn't find any results at {RESULTS_ROOT_DIR}\")\n", "else:\n", " print(f\"Found {len(results)} benchmark results.\")" ] }, { "cell_type": "markdown", "id": "0ecc0d33-902d-492a-89c6-d9729fe69fa1", "metadata": {}, "source": [ "Aggregate results into a single DataFrame, generate a unique identifier for each row, and drop potential duplicates:" ] }, { "cell_type": "code", "execution_count": 7, "id": "94f1900f-fea3-4872-b722-ee1cd3f5f7e1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "df.shape=(241, 58)\n", "Dropping 31 duplicates!\n", "df.shape=(210, 58)\n" ] } ], "source": [ "df = pd.DataFrame(list(results.values()))\n", "\n", "def row_id(row) -> str:\n", " \"\"\"Unique row identifier.\"\"\"\n", " numeric_or_multiple_choice = \"Num\" if row[numeric_prompt_col] else \"QA\"\n", " return f\"{row[model_col]}__{row[task_col]}__{row['num_features']}__{numeric_or_multiple_choice}\"\n", "\n", "print(f\"{df.shape=}\")\n", "df[\"id\"] = df.apply(row_id, axis=1)\n", "\n", "# Drop duplicates\n", "len_with_dups = len(df)\n", "df = df.drop_duplicates(subset=[\"name\", \"is_inst\", \"num_features\", task_col, numeric_prompt_col])\n", "df = df.set_index(\"id\", drop=True, verify_integrity=True)\n", "\n", "if len_with_dups != len(df):\n", " print(f\"Dropping {len_with_dups - len(df)} duplicates!\")\n", " print(f\"{df.shape=}\")" ] }, { "cell_type": "markdown", "id": "97cd24df-3c1f-49c0-bb50-a6fdba22e0fb", "metadata": {}, "source": [ "Load scores DFs and analyze score distribution:" ] }, { "cell_type": "code", "execution_count": 8, "id": "43ba0fbf-b886-4f2a-8a01-564aa6f0f2c6", "metadata": {}, "outputs": [], "source": [ "def load_model_scores_df(df_row: pd.Series) -> pd.DataFrame:\n", " \"\"\"Loads the CSV containing model scores corresponding to the given DF row.\"\"\"\n", " if predictions_path_col in df_row and not pd.isna(df_row[predictions_path_col]):\n", " return pd.read_csv(df_row[predictions_path_col], index_col=0)\n", " return None" ] },
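{ "cell_type": "markdown", "id": "f0e1d2c3-0000-4000-8000-0000000000a3", "metadata": {}, "source": [ "For each run we fit a decision threshold on the model's risk scores and evaluate its accuracy. The helper below is a minimal sketch of accuracy-maximizing threshold selection; the helper name `fit_best_threshold`, and the score/label column names used in the next cell, are assumptions (the exact fitting procedure behind the `fit_thresh_on_100` column is not recorded here). A quick check on synthetic scores follows the definition:" ] }, { "cell_type": "code", "execution_count": null, "id": "f0e1d2c3-0000-4000-8000-0000000000a4", "metadata": {}, "outputs": [], "source": [ "from sklearn import metrics\n", "\n", "def fit_best_threshold(labels, risk_scores) -> float:\n", " \"\"\"Returns the score threshold that maximizes accuracy on the given data.\"\"\"\n", " thresholds = np.unique(risk_scores)\n", " accs = [metrics.accuracy_score(labels, (risk_scores >= thr).astype(int)) for thr in thresholds]\n", " return float(thresholds[int(np.argmax(accs))])\n", "\n", "# Synthetic check: scores separated around 0.5 should yield a threshold near 0.5\n", "rng = np.random.default_rng(42)\n", "y = rng.integers(0, 2, size=1_000)\n", "s = np.clip(0.25 + 0.5 * y + rng.normal(0, 0.1, size=1_000), 0, 1)\n", "assert 0.3 < fit_best_threshold(y, s) < 0.7" ] }, { "cell_type": "code", "execution_count": 9, "id": "54e22896-1d3d-4d95-bb90-2b453f58f09a", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "64f4e6718312499b84fb8c9637ff10fd", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/210 [00:00<?, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Columns for the fitted and optimal threshold statistics\n", "fit_thresh_col = \"fit_thresh_on_100\"\n", "fit_acc_col = \"fit_thresh_accuracy\"\n", "optimal_thres_col = \"optimal_thresh\"\n", "optimal_acc_col = \"optimal_thresh_accuracy\"\n", "score_stdev_col = \"score_stdev\"\n", "score_mean_col = \"score_mean\"\n", "\n", "scores_stats = {}\n", "for row_id, row in tqdm(df.iterrows(), total=len(df)):\n", " scores_df = load_model_scores_df(row)\n", " if scores_df is None:\n", " continue\n", "\n", " # NOTE: score/label column names are assumed from the predictions CSV format\n", " risk_scores = scores_df[\"risk_score\"].to_numpy()\n", " labels = scores_df[\"label\"].to_numpy()\n", "\n", " # Fit a threshold on the first 100 samples, and the optimal threshold on all data\n", " fit_thr = fit_best_threshold(labels[:100], risk_scores[:100])\n", " opt_thr = fit_best_threshold(labels, risk_scores)\n", "\n", " fit_acc = metrics.accuracy_score(labels, (risk_scores >= fit_thr).astype(int))\n", " opt_acc = 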
metrics.accuracy_score(labels, (risk_scores >= opt_thr).astype(int))\n", "\n", " # Save results\n", " scores_stats[row_id] = {\n", " fit_thresh_col: fit_thr,\n", " fit_acc_col: fit_acc,\n", " optimal_thres_col: opt_thr,\n", " optimal_acc_col: opt_acc,\n", " score_stdev_col: np.std(risk_scores),\n", " score_mean_col: np.mean(risk_scores),\n", " }" ] }, { "cell_type": "markdown", "id": "b1a58b87-2de8-473a-badb-934951d1bcdc", "metadata": {}, "source": [ "Update results DF with scores statistics:" ] }, { "cell_type": "code", "execution_count": 10, "id": "0ba74c81-c008-4010-a9a3-1b511ae445df", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "scores_stats_df.shape=(210, 6)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
accuracyaccuracy_diffaccuracy_ratiobalanced_accuracybalanced_accuracy_diffbalanced_accuracy_ratiobrier_score_losseceece_quantileequalized_odds_diff...nameis_instnum_featuresuses_all_featuresfit_thresh_on_100fit_thresh_accuracyoptimal_threshoptimal_thresh_accuracyscore_stdevscore_mean
gemma-2-27b__ACSTravelTime__-1__Num0.5393920.2580000.6560000.5045000.1001840.8232040.3880770.347071NaN0.221264...Gemma 2 27BFalse-1True0.130000.5146760.2100000.5246920.3110140.219864
gemma-1.1-2b-it__ACSEmployment__-1__QA0.4628740.2353840.6318810.4273720.0916400.8007180.4083980.3821430.3763080.212855...Gemma 2B (it)True-1True0.020330.4820540.0075770.4853950.2321830.216562
\n", "

2 rows × 64 columns

\n", "
" ], "text/plain": [ " accuracy accuracy_diff \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.539392 0.258000 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.462874 0.235384 \n", "\n", " accuracy_ratio balanced_accuracy \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.656000 0.504500 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.631881 0.427372 \n", "\n", " balanced_accuracy_diff \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.100184 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.091640 \n", "\n", " balanced_accuracy_ratio \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.823204 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.800718 \n", "\n", " brier_score_loss ece \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.388077 0.347071 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.408398 0.382143 \n", "\n", " ece_quantile equalized_odds_diff \\\n", "gemma-2-27b__ACSTravelTime__-1__Num NaN 0.221264 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.376308 0.212855 \n", "\n", " ... name is_inst \\\n", "gemma-2-27b__ACSTravelTime__-1__Num ... Gemma 2 27B False \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA ... Gemma 2B (it) True \n", "\n", " num_features uses_all_features \\\n", "gemma-2-27b__ACSTravelTime__-1__Num -1 True \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA -1 True \n", "\n", " fit_thresh_on_100 \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.13000 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.02033 \n", "\n", " fit_thresh_accuracy optimal_thresh \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.514676 0.210000 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.482054 0.007577 \n", "\n", " optimal_thresh_accuracy score_stdev \\\n", "gemma-2-27b__ACSTravelTime__-1__Num 0.524692 0.311014 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.485395 0.232183 \n", "\n", " score_mean \n", "gemma-2-27b__ACSTravelTime__-1__Num 0.219864 \n", "gemma-1.1-2b-it__ACSEmployment__-1__QA 0.216562 \n", "\n", "[2 rows x 64 columns]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "scores_stats_df = pd.DataFrame(scores_stats.values(), index=list(scores_stats.keys()))\n", "print(f\"{scores_stats_df.shape=}\")\n", "\n", "results_df = pd.concat((df, scores_stats_df), axis=\"columns\")\n", "results_df.sample(2)" ] }, { "cell_type": "markdown", "id": "2d04ff64-39c4-43cc-9df2-fa4e099c4e56", "metadata": {}, "source": [ "Check if any results are missing:" ] }, { "cell_type": "code", "execution_count": 11, "id": "f1faf926-de0f-442e-baf7-c3029f5386e4", "metadata": {}, "outputs": [], "source": [ "experiments_per_model_task_pair = results_df.groupby([model_col, task_col]).nunique().max(axis=None)\n", "\n", "for m in results_df[model_col].unique():\n", " for t in results_df[task_col].unique():\n", " match_ = results_df[(results_df[model_col] == m) & (results_df[task_col] == t)]\n", " if len(match_) < experiments_per_model_task_pair:\n", " print(f\"Couldn't find all results for m={m}, t={t} ({len(match_)} < {experiments_per_model_task_pair})\")" ] }, { "cell_type": "markdown", "id": "a6e859fc-9d82-436d-a238-11fd008da44c", "metadata": {}, "source": [ "Finally, save results DF to the results root directory:" ] }, { "cell_type": "code", "execution_count": 12, "id": "9725afa2-b6f8-46b1-86e7-57ae82ef05d9", "metadata": {}, "outputs": [], "source": [ "from utils import get_current_timestamp\n", "results_df.to_csv(Path(RESULTS_ROOT_DIR) / f\"aggregated_results.{get_current_timestamp()}.csv\")" ] }, { "cell_type": "markdown", "id": "c3a6beb5-b44b-4f81-ae1a-d027afb2c5f4", "metadata": {}, "source": [ "---" ] } ], 
"metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 5 }