{ "cells": [ { "cell_type": "markdown", "id": "a436d249-9dfc-4d25-9695-549cb440ea18", "metadata": {}, "source": [ "# Render paper plots and tables" ] }, { "cell_type": "code", "execution_count": 1, "id": "a7df2c18-8ad5-4733-b77e-68afde7064a1", "metadata": {}, "outputs": [], "source": [ "import math\n", "import logging\n", "from pathlib import Path\n", "\n", "import numpy as np\n", "import pandas as pd\n", "from tqdm.auto import tqdm" ] }, { "cell_type": "markdown", "id": "a247e64e-8d4c-4dc7-beb2-cf70e1492fc2", "metadata": {}, "source": [ "**Note:** _Change_ the following path to point to the aggregated results file on your local system \n", "(it can be obtained using the `parse-acs-results.ipynb` notebook)." ] }, { "cell_type": "code", "execution_count": 2, "id": "6d4d0fde-5cea-4ed8-b496-00627aac840f", "metadata": {}, "outputs": [], "source": [ "ACS_AGG_RESULTS_PATH = Path(\"../results\") / \"aggregated-results.2024-08.csv\"\n", "ACS_AGG_RESULTS_PATH = Path(ACS_AGG_RESULTS_PATH)" ] }, { "cell_type": "code", "execution_count": 3, "id": "ddc248c4-a5c5-4c95-9f2a-3d386c830520", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "results_df.shape=(210, 64)\n" ] }, { "data": { "text/html": [ "
\n", " | accuracy | \n", "accuracy_diff | \n", "accuracy_ratio | \n", "balanced_accuracy | \n", "balanced_accuracy_diff | \n", "balanced_accuracy_ratio | \n", "brier_score_loss | \n", "ece | \n", "ece_quantile | \n", "equalized_odds_diff | \n", "... | \n", "name | \n", "is_inst | \n", "num_features | \n", "uses_all_features | \n", "fit_thresh_on_100 | \n", "fit_thresh_accuracy | \n", "optimal_thresh | \n", "optimal_thresh_accuracy | \n", "score_stdev | \n", "score_mean | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
penai/gpt-4o-mini__ACSTravelTime__-1__Num | \n", "0.570906 | \n", "0.315286 | \n", "0.624340 | \n", "0.553706 | \n", "0.070915 | \n", "0.876372 | \n", "0.267584 | \n", "0.149517 | \n", "NaN | \n", "0.382569 | \n", "... | \n", "GPT 4o mini (it) | \n", "True | \n", "-1 | \n", "True | \n", "0.250000 | \n", "0.510013 | \n", "0.450000 | \n", "0.570968 | \n", "0.223652 | \n", "0.384149 | \n", "
penai/gpt-4o-mini__ACSTravelTime__-1__QA | \n", "0.551154 | \n", "0.119294 | \n", "0.812222 | \n", "0.588927 | \n", "0.113947 | \n", "0.823835 | \n", "0.404025 | \n", "0.393327 | \n", "0.393301 | \n", "0.274655 | \n", "... | \n", "GPT 4o mini (it) | \n", "True | \n", "-1 | \n", "True | \n", "0.998499 | \n", "0.602202 | \n", "0.970688 | \n", "0.593339 | \n", "0.365205 | \n", "0.772858 | \n", "
2 rows × 64 columns
\n", "\n", " | threshold | \n", "n_samples | \n", "n_positives | \n", "n_negatives | \n", "model_name | \n", "accuracy | \n", "tpr | \n", "fnr | \n", "fpr | \n", "tnr | \n", "... | \n", "equalized_odds_diff | \n", "roc_auc | \n", "ece | \n", "ece_quantile | \n", "predictions_path | \n", "config_task_name | \n", "config_model_name | \n", "name | \n", "num_features | \n", "uses_all_features | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
new_index | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
XGBoost_ACSPublicCoverage | \n", "0.5 | \n", "113829 | \n", "33971 | \n", "79858 | \n", "NaN | \n", "0.801650 | \n", "0.515175 | \n", "0.484825 | \n", "0.076486 | \n", "0.923514 | \n", "... | \n", "0.368044 | \n", "0.839742 | \n", "0.004371 | \n", "0.004271 | \n", "/fast/groups/sf/folktexts-results/2024-07-03/b... | \n", "ACSPublicCoverage | \n", "XGBoost | \n", "XGBoost | \n", "-1 | \n", "True | \n", "
GBM_ACSIncome | \n", "0.5 | \n", "166450 | \n", "61233 | \n", "105217 | \n", "NaN | \n", "0.813584 | \n", "0.727973 | \n", "0.272027 | \n", "0.136594 | \n", "0.863406 | \n", "... | \n", "0.630389 | \n", "0.890792 | \n", "0.007721 | \n", "0.007146 | \n", "/fast/groups/sf/folktexts-results/2024-07-03/b... | \n", "ACSIncome | \n", "GBM | \n", "GBM | \n", "-1 | \n", "True | \n", "
2 rows × 42 columns
\n", "\n", " | accuracy | \n", "accuracy_diff | \n", "accuracy_ratio | \n", "balanced_accuracy | \n", "balanced_accuracy_diff | \n", "balanced_accuracy_ratio | \n", "brier_score_loss | \n", "ece | \n", "ece_quantile | \n", "equalized_odds_diff | \n", "... | \n", "num_features | \n", "uses_all_features | \n", "fit_thresh_on_100 | \n", "fit_thresh_accuracy | \n", "optimal_thresh | \n", "optimal_thresh_accuracy | \n", "score_stdev | \n", "score_mean | \n", "model_size | \n", "model_family | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Meta-Llama-3-70B__ACSIncome__-1__Num | \n", "0.543953 | \n", "0.134809 | \n", "0.771466 | \n", "0.636284 | \n", "0.078752 | \n", "0.887756 | \n", "0.237630 | \n", "0.270157 | \n", "NaN | \n", "0.191080 | \n", "... | \n", "-1 | \n", "True | \n", "0.89560 | \n", "0.784722 | \n", "0.8956 | \n", "0.784722 | \n", "0.253299 | \n", "0.638033 | \n", "70 | \n", "Llama | \n", "
Meta-Llama-3-70B-Instruct__ACSIncome__-1__Num | \n", "0.665731 | \n", "0.086680 | \n", "0.880953 | \n", "0.722658 | \n", "0.061832 | \n", "0.919317 | \n", "0.230512 | \n", "0.246092 | \n", "NaN | \n", "0.232483 | \n", "... | \n", "-1 | \n", "True | \n", "0.75718 | \n", "0.735764 | \n", "0.8239 | \n", "0.784848 | \n", "0.290286 | \n", "0.613968 | \n", "70 | \n", "Llama | \n", "
2 rows × 66 columns
\n", "\n", " | White_score_bias | \n", "Black_score_bias | \n", "Asian_score_bias | \n", "White_v_Black_score_bias | \n", "White_v_Asian_score_bias | \n", "Asian_v_Black_score_bias | \n", "
---|---|---|---|---|---|---|
Mistral-7B-v0.1__ACSIncome__-1__Num | \n", "0.353045 | \n", "0.329428 | \n", "0.319214 | \n", "0.023617 | \n", "0.033831 | \n", "-0.010215 | \n", "
gemma-1.1-2b-it__ACSIncome__-1__Num | \n", "0.260100 | \n", "0.403764 | \n", "0.202488 | \n", "-0.143664 | \n", "0.057612 | \n", "-0.201276 | \n", "
Yi-34B-Chat__ACSIncome__-1__Num | \n", "0.210531 | \n", "0.229600 | \n", "0.212839 | \n", "-0.019069 | \n", "-0.002308 | \n", "-0.016761 | \n", "
\n", " | White_score_bias | \n", "Black_score_bias | \n", "Asian_score_bias | \n", "White_v_Black_score_bias | \n", "White_v_Asian_score_bias | \n", "Asian_v_Black_score_bias | \n", "accuracy | \n", "accuracy_diff | \n", "accuracy_ratio | \n", "balanced_accuracy | \n", "... | \n", "name | \n", "is_inst | \n", "num_features | \n", "uses_all_features | \n", "fit_thresh_on_100 | \n", "fit_thresh_accuracy | \n", "optimal_thresh | \n", "optimal_thresh_accuracy | \n", "score_stdev | \n", "score_mean | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Mixtral-8x22B-v0.1__ACSIncome__-1__Num | \n", "0.054322 | \n", "-0.069436 | \n", "0.130153 | \n", "0.123758 | \n", "-0.075831 | \n", "0.199589 | \n", "0.740583 | \n", "0.148527 | \n", "0.828562 | \n", "0.761428 | \n", "... | \n", "Mixtral 8x22B | \n", "False | \n", "-1.0 | \n", "True | \n", "0.65 | \n", "0.769300 | \n", "0.45 | \n", "0.740595 | \n", "0.379947 | \n", "0.411113 | \n", "
Mixtral-8x22B-Instruct-v0.1__ACSIncome__-1__Num | \n", "0.110399 | \n", "0.055872 | \n", "0.156710 | \n", "0.054527 | \n", "-0.046311 | \n", "0.100837 | \n", "0.767882 | \n", "0.096716 | \n", "0.885938 | \n", "0.770218 | \n", "... | \n", "Mixtral 8x22B (it) | \n", "True | \n", "-1.0 | \n", "True | \n", "0.65 | \n", "0.769967 | \n", "0.55 | \n", "0.767882 | \n", "0.298323 | \n", "0.474955 | \n", "
penai/gpt-4o-mini__ACSIncome__-1__Num | \n", "-0.001300 | \n", "-0.019960 | \n", "-0.000407 | \n", "0.018661 | \n", "-0.000893 | \n", "0.019553 | \n", "0.777393 | \n", "0.080767 | \n", "0.904748 | \n", "0.758922 | \n", "... | \n", "GPT 4o mini (it) | \n", "True | \n", "-1.0 | \n", "True | \n", "0.65 | \n", "0.777302 | \n", "0.35 | \n", "0.775512 | \n", "0.317359 | \n", "0.363743 | \n", "
3 rows × 70 columns
\n", "