From 3e84c39ac6097d170c4d976f76f208ae36fffc8e Mon Sep 17 00:00:00 2001
From: Sid Kapur <sid@openlabsusa.org>
Date: Sat, 23 Aug 2025 13:27:53 -0400
Subject: [PATCH 1/5] Remove unnecessary get_path helper

---
 python/housing_data/data_loading_helpers.py |  7 ----
 python/housing_data/place_population.py     | 43 ++++-----------------
 python/housing_data/state_population.py     | 14 ++-----
 3 files changed, 11 insertions(+), 53 deletions(-)

diff --git a/python/housing_data/data_loading_helpers.py b/python/housing_data/data_loading_helpers.py
index 3a7419f..dd14446 100644
--- a/python/housing_data/data_loading_helpers.py
+++ b/python/housing_data/data_loading_helpers.py
@@ -34,10 +34,3 @@ def get_url_text(
     else:
         web_url = os.path.join(web_prefix, common_path)
         return requests.get(web_url).text
-
-
-def get_path(url: str, data_path: Optional[Path]) -> str:
-    if data_path is not None:
-        return str(Path(data_path, Path(url).name))
-    else:
-        return url
diff --git a/python/housing_data/place_population.py b/python/housing_data/place_population.py
index 137bd61..fcab028 100644
--- a/python/housing_data/place_population.py
+++ b/python/housing_data/place_population.py
@@ -5,15 +5,10 @@
 import numpy as np
 import pandas as pd
 from housing_data.build_data_utils import impute_2025_population
-from housing_data.data_loading_helpers import get_path, get_url_text
 
 
 def _get_places_crosswalk_df(data_path: Optional[Path] = None) -> pd.DataFrame:
-    df = pd.read_fwf(
-        get_path(
-            "https://www2.census.gov/geo/tiger/PREVGENZ/pl/us_places.txt", data_path
-        )
-    )
+    df = pd.read_fwf(data_path / "us_places.txt")
 
     df["State Code"] = df["CENSUS"] // 10000
     df["Place Code"] = df["CENSUS"] % 10000
@@ -147,12 +142,8 @@ def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame:
     return df
 
 
-def _load_raw_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame:
-    tables = get_url_text(
-        "https://www2.census.gov/programs-surveys/popest/tables/1990-2000/"
-        "2000-subcounties-evaluation-estimates/sc2000f_us.txt",
-        data_path,
-    ).split("\f")
+def _load_raw_place_populations_1990s(data_path: Path) -> pd.DataFrame:
+    tables = (data_path / "sc2000f_us.txt").read_text().split("\f")
 
     common_cols = [
         "Block",
@@ -396,14 +387,8 @@ def _melt_df(
     )
 
 
-def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame:
-    df = pd.read_csv(
-        get_path(
-            "https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/cities/sub-est00int.csv",
-            data_path,
-        ),
-        encoding="latin_1",
-    )
+def get_place_populations_2000s(data_path: Path) -> pd.DataFrame:
+    df = pd.read_csv(data_path / "sub-est00int.csv", encoding="latin_1")
     return _melt_df(
         df,
         years=list(range(2000, 2011)),
@@ -412,26 +397,14 @@ def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame:
     )
 
 
-def get_place_populations_2010s(data_path: Optional[Path]) -> pd.DataFrame:
-    df = pd.read_csv(
-        get_path(
-            "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/SUB-EST2020_ALL.csv",
-            data_path,
-        ),
-        encoding="latin_1",
-    )
+def get_place_populations_2010s(data_path: Path) -> pd.DataFrame:
+    df = pd.read_csv(data_path / "SUB-EST2020_ALL.csv", encoding="latin_1")
 
     return _melt_df(df, years=list(range(2010, 2021)))
 
 
 def get_place_populations_2020s(data_path: Optional[Path]) -> pd.DataFrame:
-    df = pd.read_csv(
-        get_path(
-            "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/sub-est2024.csv",
-            data_path,
-        ),
-        encoding="latin_1",
-    )
+    df = pd.read_csv(data_path / "sub-est2024.csv", encoding="latin_1")
     df = _melt_df(df, years=list(range(2020, 2025)))
     df = impute_2025_population(df)
     return df
diff --git a/python/housing_data/state_population.py b/python/housing_data/state_population.py
index fade175..cd849c7 100644
--- a/python/housing_data/state_population.py
+++ b/python/housing_data/state_population.py
@@ -4,7 +4,7 @@
 import pandas as pd
 import us
 from housing_data.build_data_utils import impute_2025_population
-from housing_data.data_loading_helpers import get_path, get_url_text
+from housing_data.data_loading_helpers import get_url_text
 
 DIVISIONS = {
     "New England": [
@@ -168,10 +168,7 @@ def get_state_populations_1990s(data_path: Path) -> pd.DataFrame:
 
 def get_state_populations_2000s(data_path: Path) -> pd.DataFrame:
     df = pd.read_excel(
-        get_path(
-            "https://www2.census.gov/programs-surveys/popest/tables/2000-2010/intercensal/state/st-est00int-01.xls",
-            data_path,
-        ),
+        data_path / "st-est00int-01.xls",
         skiprows=3,
         skipfooter=8,
     )
@@ -206,12 +203,7 @@ def _melt_df(df: pd.DataFrame, years: list[int]) -> pd.DataFrame:
 
 
 def get_state_populations_2010s(data_path: Path) -> pd.DataFrame:
-    df = pd.read_csv(
-        get_path(
-            "https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/state/totals/nst-est2020-alldata.csv",
-            data_path,
-        )
-    )
+    df = pd.read_csv(data_path / "nst-est2020-alldata.csv")
 
     return _melt_df(df, list(range(2010, 2020)))
 

From 27a47ecd6830d9f9af2138a2467602fd06ce6dfe Mon Sep 17 00:00:00 2001
From: Sid Kapur <sid@openlabsusa.org>
Date: Sat, 23 Aug 2025 13:30:33 -0400
Subject: [PATCH 2/5] Read county and state population files directly from data_path

---
 python/housing_data/county_population.py | 7 +------
 python/housing_data/state_population.py  | 6 +-----
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/python/housing_data/county_population.py b/python/housing_data/county_population.py
index db998ef..8f2a1ca 100644
--- a/python/housing_data/county_population.py
+++ b/python/housing_data/county_population.py
@@ -4,7 +4,6 @@
 import pandas as pd
 import us
 from housing_data.build_data_utils import impute_2025_population
-from housing_data.data_loading_helpers import get_url_text
 from housing_data.fips_crosswalk import load_fips_crosswalk
 
 
@@ -119,11 +118,7 @@ def get_county_fips_crosswalk(data_repo_path: Path) -> pd.DataFrame:
 
 
 def get_county_populations_1990s(data_path: Path) -> pd.DataFrame:
-    table_text = get_url_text(
-        "https://www2.census.gov/programs-surveys/popest/tables/1990-2000/counties/totals/99c8_00.txt",
-        data_path,
-        encoding="latin_1",
-    )
+    table_text = (data_path / "99c8_00.txt").read_text(encoding="latin_1")
 
     table_text = table_text[: table_text.index("Block 2")].strip()
 
diff --git a/python/housing_data/state_population.py b/python/housing_data/state_population.py
index cd849c7..f306d30 100644
--- a/python/housing_data/state_population.py
+++ b/python/housing_data/state_population.py
@@ -4,7 +4,6 @@
 import pandas as pd
 import us
 from housing_data.build_data_utils import impute_2025_population
-from housing_data.data_loading_helpers import get_url_text
 
 DIVISIONS = {
     "New England": [
@@ -75,10 +74,7 @@ def _line_to_cols(row: str) -> list[str]:
 
 
 def get_state_populations_1980s(data_path: Path) -> pd.DataFrame:
-    states_80s_text = get_url_text(
-        "https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt",
-        data_path,
-    )
+    states_80s_text = (data_path / "st8090ts.txt").read_text()
     handle = StringIO(states_80s_text)
 
     for _ in range(10):

From 2938f6ef2f54c10dea0a96d64aa7a93b5fbd9fcb Mon Sep 17 00:00:00 2001
From: Sid Kapur <sid@openlabsusa.org>
Date: Sat, 23 Aug 2025 13:35:52 -0400
Subject: [PATCH 3/5] Make data_path non-Optional in place_population (fix mypy)

---
 python/housing_data/place_population.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/housing_data/place_population.py b/python/housing_data/place_population.py
index fcab028..b062ed1 100644
--- a/python/housing_data/place_population.py
+++ b/python/housing_data/place_population.py
@@ -7,7 +7,7 @@
 from housing_data.build_data_utils import impute_2025_population
 
 
-def _get_places_crosswalk_df(data_path: Optional[Path] = None) -> pd.DataFrame:
+def _get_places_crosswalk_df(data_path: Path) -> pd.DataFrame:
     df = pd.read_fwf(data_path / "us_places.txt")
 
     df["State Code"] = df["CENSUS"] // 10000
@@ -74,7 +74,7 @@ def get_unincorporated_places_populations_1980() -> pd.DataFrame:
     return remainder_df
 
 
-def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame:
+def get_place_populations_1980(data_path: Path) -> pd.DataFrame:
     # Assuming this is run from `python/`
     # For the header row, use the nice descriptive names that IPUMS provides rather than the code names
     df = pd.read_csv("../raw_data/nhgis0015_ds104_1980_place_070.csv", header=1)
@@ -269,7 +269,7 @@ def remove_duplicate_cities(df: pd.DataFrame) -> pd.DataFrame:
     return df[~place_state_tuples.isin(dupe_cities)]
 
 
-def get_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame:
+def get_place_populations_1990s(data_path: Path) -> pd.DataFrame:
     combined_df = _load_raw_place_populations_1990s(data_path)
 
     city_rows = (
@@ -403,7 +403,7 @@ def get_place_populations_2010s(data_path: Path) -> pd.DataFrame:
     return _melt_df(df, years=list(range(2010, 2021)))
 
 
-def get_place_populations_2020s(data_path: Optional[Path]) -> pd.DataFrame:
+def get_place_populations_2020s(data_path: Path) -> pd.DataFrame:
     df = pd.read_csv(data_path / "sub-est2024.csv", encoding="latin_1")
     df = _melt_df(df, years=list(range(2020, 2025)))
     df = impute_2025_population(df)
@@ -455,7 +455,7 @@ def interpolate_1980s_populations(
     return interp_df
 
 
-def get_place_population_estimates(data_path: Optional[Path] = None) -> pd.DataFrame:
+def get_place_population_estimates(data_path: Path) -> pd.DataFrame:
     """
     Returns a DataFrame with the columns:
     - state_code (int)

From e7c256b2445de99256a3597553af722c332a4b57 Mon Sep 17 00:00:00 2001
From: Sid Kapur <sid@openlabsusa.org>
Date: Sat, 23 Aug 2025 13:36:26 -0400
Subject: [PATCH 4/5] Make load_places data_repo_path non-Optional (fix mypy)

---
 python/housing_data/build_places.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/housing_data/build_places.py b/python/housing_data/build_places.py
index 747ec93..3e648b1 100644
--- a/python/housing_data/build_places.py
+++ b/python/housing_data/build_places.py
@@ -293,7 +293,7 @@ def get_name_spelling(places_df: pd.DataFrame) -> pd.Series:
 
 
 def load_places(
-    data_repo_path: Optional[Path], counties_population_df: pd.DataFrame = None
+    data_repo_path: Path, counties_population_df: pd.DataFrame = None
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     raw_places_df = pd.concat(
         [

From d706a805f9759c38f3640ce9c77f8c2f5dc33f59 Mon Sep 17 00:00:00 2001
From: Sid Kapur <sid@openlabsusa.org>
Date: Sat, 23 Aug 2025 13:42:13 -0400
Subject: [PATCH 5/5] Drop None fallback for data_path in load_places (fix mypy)

---
 python/housing_data/build_places.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/housing_data/build_places.py b/python/housing_data/build_places.py
index 3e648b1..40d30c4 100644
--- a/python/housing_data/build_places.py
+++ b/python/housing_data/build_places.py
@@ -310,7 +310,7 @@ def load_places(
     raw_places_df.to_parquet(PUBLIC_DIR / "places_annual_without_population.parquet")
 
     place_populations_df = place_population.get_place_population_estimates(
-        data_path=data_repo_path / PLACE_POPULATION_DIR if data_repo_path else None
+        data_path=data_repo_path / PLACE_POPULATION_DIR
     )
     place_populations_df = fix_nyc_boroughs_population(
         place_populations_df, counties_population_df