From 3e84c39ac6097d170c4d976f76f208ae36fffc8e Mon Sep 17 00:00:00 2001 From: Sid Kapur Date: Sat, 23 Aug 2025 13:27:53 -0400 Subject: [PATCH 1/5] Remove unnecessary get_path helper --- python/housing_data/data_loading_helpers.py | 7 ---- python/housing_data/place_population.py | 43 ++++----------------- python/housing_data/state_population.py | 14 ++----- 3 files changed, 11 insertions(+), 53 deletions(-) diff --git a/python/housing_data/data_loading_helpers.py b/python/housing_data/data_loading_helpers.py index 3a7419f..dd14446 100644 --- a/python/housing_data/data_loading_helpers.py +++ b/python/housing_data/data_loading_helpers.py @@ -34,10 +34,3 @@ def get_url_text( else: web_url = os.path.join(web_prefix, common_path) return requests.get(web_url).text - - -def get_path(url: str, data_path: Optional[Path]) -> str: - if data_path is not None: - return str(Path(data_path, Path(url).name)) - else: - return url diff --git a/python/housing_data/place_population.py b/python/housing_data/place_population.py index 137bd61..fcab028 100644 --- a/python/housing_data/place_population.py +++ b/python/housing_data/place_population.py @@ -5,15 +5,10 @@ import numpy as np import pandas as pd from housing_data.build_data_utils import impute_2025_population -from housing_data.data_loading_helpers import get_path, get_url_text def _get_places_crosswalk_df(data_path: Optional[Path] = None) -> pd.DataFrame: - df = pd.read_fwf( - get_path( - "https://www2.census.gov/geo/tiger/PREVGENZ/pl/us_places.txt", data_path - ) - ) + df = pd.read_fwf(data_path / "us_places.txt") df["State Code"] = df["CENSUS"] // 10000 df["Place Code"] = df["CENSUS"] % 10000 @@ -147,12 +142,8 @@ def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame: return df -def _load_raw_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame: - tables = get_url_text( - "https://www2.census.gov/programs-surveys/popest/tables/1990-2000/" - "2000-subcounties-evaluation-estimates/sc2000f_us.txt", - data_path, - ).split("\f") +def _load_raw_place_populations_1990s(data_path: Path) -> pd.DataFrame: + tables = (data_path / "sc2000f_us.txt").read_text().split("\f") common_cols = [ "Block", @@ -396,14 +387,8 @@ def _melt_df( ) -def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame: - df = pd.read_csv( - get_path( - "https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/cities/sub-est00int.csv", - data_path, - ), - encoding="latin_1", - ) +def get_place_populations_2000s(data_path: Path) -> pd.DataFrame: + df = pd.read_csv(data_path / "sub-est00int.csv", encoding="latin_1") return _melt_df( df, years=list(range(2000, 2011)), @@ -412,26 +397,14 @@ def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame: ) -def get_place_populations_2010s(data_path: Optional[Path]) -> pd.DataFrame: - df = pd.read_csv( - get_path( - "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/SUB-EST2020_ALL.csv", - data_path, - ), - encoding="latin_1", - ) +def get_place_populations_2010s(data_path: Path) -> pd.DataFrame: + df = pd.read_csv(data_path / "SUB-EST2020_ALL.csv", encoding="latin_1") return _melt_df(df, years=list(range(2010, 2021))) def get_place_populations_2020s(data_path: Optional[Path]) -> pd.DataFrame: - df = pd.read_csv( - get_path( - "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/sub-est2024.csv", - data_path, - ), - encoding="latin_1", - ) + df = pd.read_csv(data_path / "sub-est2024.csv", encoding="latin_1") df = _melt_df(df, years=list(range(2020, 2025))) df = impute_2025_population(df) return df diff --git a/python/housing_data/state_population.py b/python/housing_data/state_population.py index fade175..cd849c7 100644 --- a/python/housing_data/state_population.py +++ b/python/housing_data/state_population.py @@ -4,7 +4,7 @@ import pandas as pd import us from housing_data.build_data_utils import impute_2025_population -from housing_data.data_loading_helpers import get_path, get_url_text +from housing_data.data_loading_helpers import get_url_text DIVISIONS = { "New England": [ @@ -168,10 +168,7 @@ def get_state_populations_1990s(data_path: Path) -> pd.DataFrame: def get_state_populations_2000s(data_path: Path) -> pd.DataFrame: df = pd.read_excel( - get_path( - "https://www2.census.gov/programs-surveys/popest/tables/2000-2010/intercensal/state/st-est00int-01.xls", - data_path, - ), + data_path / "st-est00int-01.xls", skiprows=3, skipfooter=8, ) @@ -206,12 +203,7 @@ def _melt_df(df: pd.DataFrame, years: list[int]) -> pd.DataFrame: def get_state_populations_2010s(data_path: Path) -> pd.DataFrame: - df = pd.read_csv( - get_path( - "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/totals/nst-est2020-alldata.csv", - data_path, - ) - ) + df = pd.read_csv(data_path / "nst-est2020-alldata.csv") return _melt_df(df, list(range(2010, 2020))) From 27a47ecd6830d9f9af2138a2467602fd06ce6dfe Mon Sep 17 00:00:00 2001 From: Sid Kapur Date: Sat, 23 Aug 2025 13:30:33 -0400 Subject: [PATCH 2/5] more cleanup --- python/housing_data/county_population.py | 7 +------ python/housing_data/state_population.py | 6 +----- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/python/housing_data/county_population.py b/python/housing_data/county_population.py index db998ef..8f2a1ca 100644 --- a/python/housing_data/county_population.py +++ b/python/housing_data/county_population.py @@ -4,7 +4,6 @@ import pandas as pd import us from housing_data.build_data_utils import impute_2025_population -from housing_data.data_loading_helpers import get_url_text from housing_data.fips_crosswalk import load_fips_crosswalk @@ -119,11 +118,7 @@ def get_county_fips_crosswalk(data_repo_path: Path) -> pd.DataFrame: def get_county_populations_1990s(data_path: Path) -> pd.DataFrame: - table_text = get_url_text( - "https://www2.census.gov/programs-surveys/popest/tables/1990-2000/counties/totals/99c8_00.txt", - data_path, - encoding="latin_1", - ) + table_text = (data_path / "99c8_00.txt").read_text(encoding="latin_1") table_text = table_text[: table_text.index("Block 2")].strip() diff --git a/python/housing_data/state_population.py b/python/housing_data/state_population.py index cd849c7..f306d30 100644 --- a/python/housing_data/state_population.py +++ b/python/housing_data/state_population.py @@ -4,7 +4,6 @@ import pandas as pd import us from housing_data.build_data_utils import impute_2025_population -from housing_data.data_loading_helpers import get_url_text DIVISIONS = { "New England": [ @@ -75,10 +74,7 @@ def _line_to_cols(row: str) -> list[str]: def get_state_populations_1980s(data_path: Path) -> pd.DataFrame: - states_80s_text = get_url_text( - "https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt", - data_path, - ) + states_80s_text = (data_path / "st8090ts.txt").read_text() handle = StringIO(states_80s_text) for _ in range(10): From 2938f6ef2f54c10dea0a96d64aa7a93b5fbd9fcb Mon Sep 17 00:00:00 2001 From: Sid Kapur Date: Sat, 23 Aug 2025 13:35:52 -0400 Subject: [PATCH 3/5] fix mypy --- python/housing_data/place_population.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/housing_data/place_population.py b/python/housing_data/place_population.py index fcab028..b062ed1 100644 --- a/python/housing_data/place_population.py +++ b/python/housing_data/place_population.py @@ -7,7 +7,7 @@ from housing_data.build_data_utils import impute_2025_population -def _get_places_crosswalk_df(data_path: Optional[Path] = None) -> pd.DataFrame: +def _get_places_crosswalk_df(data_path: Path) -> pd.DataFrame: df = pd.read_fwf(data_path / "us_places.txt") df["State Code"] = df["CENSUS"] // 10000 @@ -74,7 +74,7 @@ def get_unincorporated_places_populations_1980() -> pd.DataFrame: return remainder_df -def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame: +def get_place_populations_1980(data_path: Path) -> pd.DataFrame: # Assuming this is run from `python/` # For the header row, use the nice descriptive names that IPUMS provides rather than the code names df = pd.read_csv("../raw_data/nhgis0015_ds104_1980_place_070.csv", header=1) @@ -269,7 +269,7 @@ def remove_duplicate_cities(df: pd.DataFrame) -> pd.DataFrame: return df[~place_state_tuples.isin(dupe_cities)] -def get_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame: +def get_place_populations_1990s(data_path: Path) -> pd.DataFrame: combined_df = _load_raw_place_populations_1990s(data_path) city_rows = ( @@ -403,7 +403,7 @@ def get_place_populations_2010s(data_path: Path) -> pd.DataFrame: return _melt_df(df, years=list(range(2010, 2021))) -def get_place_populations_2020s(data_path: Optional[Path]) -> pd.DataFrame: +def get_place_populations_2020s(data_path: Path) -> pd.DataFrame: df = pd.read_csv(data_path / "sub-est2024.csv", encoding="latin_1") df = _melt_df(df, years=list(range(2020, 2025))) df = impute_2025_population(df) @@ -455,7 +455,7 @@ def interpolate_1980s_populations( return interp_df -def get_place_population_estimates(data_path: Optional[Path] = None) -> pd.DataFrame: +def get_place_population_estimates(data_path: Path) -> pd.DataFrame: """ Returns a DataFrame with the columns: - state_code (int) From e7c256b2445de99256a3597553af722c332a4b57 Mon Sep 17 00:00:00 2001 From: Sid Kapur Date: Sat, 23 Aug 2025 13:36:26 -0400 Subject: [PATCH 4/5] fix mypy --- python/housing_data/build_places.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/housing_data/build_places.py b/python/housing_data/build_places.py index 747ec93..3e648b1 100644 --- a/python/housing_data/build_places.py +++ b/python/housing_data/build_places.py @@ -293,7 +293,7 @@ def get_name_spelling(places_df: pd.DataFrame) -> pd.Series: def load_places( - data_repo_path: Optional[Path], counties_population_df: pd.DataFrame = None + data_repo_path: Path, counties_population_df: pd.DataFrame = None ) -> tuple[pd.DataFrame, pd.DataFrame]: raw_places_df = pd.concat( [ From d706a805f9759c38f3640ce9c77f8c2f5dc33f59 Mon Sep 17 00:00:00 2001 From: Sid Kapur Date: Sat, 23 Aug 2025 13:42:13 -0400 Subject: [PATCH 5/5] fix mypy? --- python/housing_data/build_places.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/housing_data/build_places.py b/python/housing_data/build_places.py index 3e648b1..40d30c4 100644 --- a/python/housing_data/build_places.py +++ b/python/housing_data/build_places.py @@ -310,7 +310,7 @@ def load_places( raw_places_df.to_parquet(PUBLIC_DIR / "places_annual_without_population.parquet") place_populations_df = place_population.get_place_population_estimates( - data_path=data_repo_path / PLACE_POPULATION_DIR if data_repo_path else None + data_path=data_repo_path / PLACE_POPULATION_DIR ) place_populations_df = fix_nyc_boroughs_population( place_populations_df, counties_population_df