Source code for folktexts.acs.acs_dataset

"""Module to access ACS data using the folktables package.
"""
from __future__ import annotations

import logging
from pathlib import Path

import pandas as pd
from folktables import ACSDataSource
from folktables.load_acs import state_list

from ..dataset import Dataset
from .acs_tasks import ACSTaskMetadata

DEFAULT_DATA_DIR = Path("~/data").expanduser().resolve()
DEFAULT_TEST_SIZE = 0.1
DEFAULT_VAL_SIZE = 0.1
DEFAULT_SEED = 42

DEFAULT_SURVEY_YEAR = "2018"
DEFAULT_SURVEY_HORIZON = "1-Year"
DEFAULT_SURVEY_UNIT = "person"


[docs] class ACSDataset(Dataset): """Wrapper for ACS folktables datasets.""" def __init__( self, data: pd.DataFrame, full_acs_data: pd.DataFrame, task: ACSTaskMetadata, test_size: float = DEFAULT_TEST_SIZE, val_size: float = DEFAULT_VAL_SIZE, subsampling: float = None, seed: int = 42, ): self._full_acs_data = full_acs_data super().__init__( data=data, task=task, test_size=test_size, val_size=val_size, subsampling=subsampling, seed=seed, )
[docs] @classmethod def make_from_task( cls, task: str | ACSTaskMetadata, cache_dir: str | Path = None, survey_year: str = DEFAULT_SURVEY_YEAR, horizon: str = DEFAULT_SURVEY_HORIZON, survey: str = DEFAULT_SURVEY_UNIT, seed: int = DEFAULT_SEED, **kwargs, ): """Construct an ACSDataset object from a given ACS task. Can customize survey sample parameters (survey year, horizon, survey type). Parameters ---------- task : str | ACSTaskMetadata The name of the ACS task or the task object itself. cache_dir : str | Path, optional The directory where ACS data is (or will be) saved to, by default uses DEFAULT_DATA_DIR. survey_year : str, optional The year from which to load survey data, by default DEFAULT_SURVEY_YEAR. horizon : str, optional The time horizon of survey data to load, by default DEFAULT_SURVEY_HORIZON. survey : str, optional The name of the survey unit to load, by default DEFAULT_SURVEY_UNIT. seed : int, optional The random seed, by default DEFAULT_SEED. **kwargs Extra key-word arguments to be passed to the Dataset constructor. """ # Create "folktables" sub-folder under the given cache dir cache_dir = Path(cache_dir or DEFAULT_DATA_DIR).expanduser().resolve() / "folktables" if not cache_dir.exists(): logging.warning(f"Creating cache directory '{cache_dir}' for ACS data.") cache_dir.mkdir(exist_ok=True, parents=False) # Parse task if given a string task_obj = ACSTaskMetadata.get_task(task) if isinstance(task, str) else task # Load ACS data source print("Loading ACS data...") data_source = ACSDataSource( survey_year=survey_year, horizon=horizon, survey=survey, root_dir=cache_dir.as_posix(), ) # Get full ACS dataset full_acs_data = data_source.get_data( states=state_list, download=True, random_seed=seed) # Parse data for this task parsed_data = cls._parse_task_data(full_acs_data, task_obj) return cls( data=parsed_data, full_acs_data=full_acs_data, task=task_obj, seed=seed, **kwargs, )
@property def task(self) -> ACSTaskMetadata: return self._task @task.setter def task(self, new_task: ACSTaskMetadata): # Parse data rows for new ACS task self._data = self._parse_task_data(self._full_acs_data, new_task) # Re-make train/test/val split self._train_indices, self._test_indices, self._val_indices = ( self._make_train_test_val_split( self._data, self.test_size, self.val_size, self._rng) ) # Check if sub-sampling is necessary (it's applied only to train/test/val indices) if self.subsampling is not None: self._subsample_train_test_val_indices(self.subsampling) self._task = new_task @classmethod def _parse_task_data(cls, full_df: pd.DataFrame, task: ACSTaskMetadata) -> pd.DataFrame: """Parse a DataFrame for compatibility with the given task object. Parameters ---------- full_df : pd.DataFrame Full DataFrame. Some rows and/or columns may be discarded for each task. task : ACSTaskMetadata The task object used to parse the given data. Returns ------- parsed_df : pd.DataFrame Parsed DataFrame in accordance with the given task. """ # Pre-process the data if necessary if isinstance(task, ACSTaskMetadata) and task.folktables_obj is not None: parsed_df = task.folktables_obj._preprocess(full_df) else: parsed_df = full_df # Threshold the target column if necessary if task.target is not None and task.target_threshold is not None and task.get_target() not in parsed_df.columns: parsed_df[task.get_target()] = task.target_threshold.apply_to_column_data(parsed_df[task.target]) return parsed_df