diff --git a/python/housing_data/build_places.py b/python/housing_data/build_places.py index 747ec93..40d30c4 100644 --- a/python/housing_data/build_places.py +++ b/python/housing_data/build_places.py @@ -293,7 +293,7 @@ def get_name_spelling(places_df: pd.DataFrame) -> pd.Series: def load_places( - data_repo_path: Optional[Path], counties_population_df: pd.DataFrame = None + data_repo_path: Path, counties_population_df: pd.DataFrame = None ) -> tuple[pd.DataFrame, pd.DataFrame]: raw_places_df = pd.concat( [ @@ -310,7 +310,7 @@ def load_places( raw_places_df.to_parquet(PUBLIC_DIR / "places_annual_without_population.parquet") place_populations_df = place_population.get_place_population_estimates( - data_path=data_repo_path / PLACE_POPULATION_DIR if data_repo_path else None + data_path=data_repo_path / PLACE_POPULATION_DIR ) place_populations_df = fix_nyc_boroughs_population( place_populations_df, counties_population_df diff --git a/python/housing_data/county_population.py b/python/housing_data/county_population.py index db998ef..8f2a1ca 100644 --- a/python/housing_data/county_population.py +++ b/python/housing_data/county_population.py @@ -4,7 +4,6 @@ import pandas as pd import us from housing_data.build_data_utils import impute_2025_population -from housing_data.data_loading_helpers import get_url_text from housing_data.fips_crosswalk import load_fips_crosswalk @@ -119,11 +118,7 @@ def get_county_fips_crosswalk(data_repo_path: Path) -> pd.DataFrame: def get_county_populations_1990s(data_path: Path) -> pd.DataFrame: - table_text = get_url_text( - "https://www2.census.gov/programs-surveys/popest/tables/1990-2000/counties/totals/99c8_00.txt", - data_path, - encoding="latin_1", - ) + table_text = (data_path / "99c8_00.txt").read_text(encoding="latin_1") table_text = table_text[: table_text.index("Block 2")].strip() diff --git a/python/housing_data/data_loading_helpers.py b/python/housing_data/data_loading_helpers.py index 3a7419f..dd14446 100644 --- a/python/housing_data/data_loading_helpers.py +++ b/python/housing_data/data_loading_helpers.py @@ -34,10 +34,3 @@ def get_url_text( else: web_url = os.path.join(web_prefix, common_path) return requests.get(web_url).text - - -def get_path(url: str, data_path: Optional[Path]) -> str: - if data_path is not None: - return str(Path(data_path, Path(url).name)) - else: - return url diff --git a/python/housing_data/place_population.py b/python/housing_data/place_population.py index 137bd61..b062ed1 100644 --- a/python/housing_data/place_population.py +++ b/python/housing_data/place_population.py @@ -5,15 +5,10 @@ import numpy as np import pandas as pd from housing_data.build_data_utils import impute_2025_population -from housing_data.data_loading_helpers import get_path, get_url_text -def _get_places_crosswalk_df(data_path: Optional[Path] = None) -> pd.DataFrame: - df = pd.read_fwf( - get_path( - "https://www2.census.gov/geo/tiger/PREVGENZ/pl/us_places.txt", data_path - ) - ) +def _get_places_crosswalk_df(data_path: Path) -> pd.DataFrame: + df = pd.read_fwf(data_path / "us_places.txt") df["State Code"] = df["CENSUS"] // 10000 df["Place Code"] = df["CENSUS"] % 10000 @@ -79,7 +74,7 @@ def get_unincorporated_places_populations_1980() -> pd.DataFrame: return remainder_df -def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame: +def get_place_populations_1980(data_path: Path) -> pd.DataFrame: # Assuming this is run from `python/` # For the header row, use the nice descriptive names that IPUMS provides rather than the code names df = pd.read_csv("../raw_data/nhgis0015_ds104_1980_place_070.csv", header=1) @@ -147,12 +142,8 @@ def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame: return df -def _load_raw_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame: - tables = get_url_text( - "https://www2.census.gov/programs-surveys/popest/tables/1990-2000/" - "2000-subcounties-evaluation-estimates/sc2000f_us.txt", - data_path, - ).split("\f") +def _load_raw_place_populations_1990s(data_path: Path) -> pd.DataFrame: + tables = (data_path / "sc2000f_us.txt").read_text().split("\f") common_cols = [ "Block", @@ -278,7 +269,7 @@ def remove_duplicate_cities(df: pd.DataFrame) -> pd.DataFrame: return df[~place_state_tuples.isin(dupe_cities)] -def get_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame: +def get_place_populations_1990s(data_path: Path) -> pd.DataFrame: combined_df = _load_raw_place_populations_1990s(data_path) city_rows = ( @@ -396,14 +387,8 @@ def _melt_df( ) -def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame: - df = pd.read_csv( - get_path( - "https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/cities/sub-est00int.csv", - data_path, - ), - encoding="latin_1", - ) +def get_place_populations_2000s(data_path: Path) -> pd.DataFrame: + df = pd.read_csv(data_path / "sub-est00int.csv", encoding="latin_1") return _melt_df( df, years=list(range(2000, 2011)), @@ -412,26 +397,14 @@ def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame: ) -def get_place_populations_2010s(data_path: Optional[Path]) -> pd.DataFrame: - df = pd.read_csv( - get_path( - "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/SUB-EST2020_ALL.csv", - data_path, - ), - encoding="latin_1", - ) +def get_place_populations_2010s(data_path: Path) -> pd.DataFrame: + df = pd.read_csv(data_path / "SUB-EST2020_ALL.csv", encoding="latin_1") return _melt_df(df, years=list(range(2010, 2021))) -def get_place_populations_2020s(data_path: Optional[Path]) -> pd.DataFrame: - df = pd.read_csv( - get_path( - "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/sub-est2024.csv", - data_path, - ), - encoding="latin_1", - ) +def get_place_populations_2020s(data_path: Path) -> pd.DataFrame: + df = pd.read_csv(data_path / "sub-est2024.csv", encoding="latin_1") df = _melt_df(df, years=list(range(2020, 2025))) df = impute_2025_population(df) return df @@ -482,7 +455,7 @@ def interpolate_1980s_populations( return interp_df -def get_place_population_estimates(data_path: Optional[Path] = None) -> pd.DataFrame: +def get_place_population_estimates(data_path: Path) -> pd.DataFrame: """ Returns a DataFrame with the columns: - state_code (int) diff --git a/python/housing_data/state_population.py b/python/housing_data/state_population.py index fade175..f306d30 100644 --- a/python/housing_data/state_population.py +++ b/python/housing_data/state_population.py @@ -4,7 +4,6 @@ import pandas as pd import us from housing_data.build_data_utils import impute_2025_population -from housing_data.data_loading_helpers import get_path, get_url_text DIVISIONS = { "New England": [ @@ -75,10 +74,7 @@ def _line_to_cols(row: str) -> list[str]: def get_state_populations_1980s(data_path: Path) -> pd.DataFrame: - states_80s_text = get_url_text( - "https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt", - data_path, - ) + states_80s_text = (data_path / "st8090ts.txt").read_text() handle = StringIO(states_80s_text) for _ in range(10): @@ -168,10 +164,7 @@ def get_state_populations_1990s(data_path: Path) -> pd.DataFrame: def get_state_populations_2000s(data_path: Path) -> pd.DataFrame: df = pd.read_excel( - get_path( - "https://www2.census.gov/programs-surveys/popest/tables/2000-2010/intercensal/state/st-est00int-01.xls", - data_path, - ), + data_path / "st-est00int-01.xls", skiprows=3, skipfooter=8, ) @@ -206,12 +199,7 @@ def _melt_df(df: pd.DataFrame, years: list[int]) -> pd.DataFrame: def get_state_populations_2010s(data_path: Path) -> pd.DataFrame: - df = pd.read_csv( - get_path( - "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/totals/nst-est2020-alldata.csv", - data_path, - ) - ) + df = pd.read_csv(data_path / "nst-est2020-alldata.csv") return _melt_df(df, list(range(2010, 2020)))