Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/housing_data/build_places.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,7 +293,7 @@ def get_name_spelling(places_df: pd.DataFrame) -> pd.Series:


def load_places(
data_repo_path: Optional[Path], counties_population_df: pd.DataFrame = None
data_repo_path: Path, counties_population_df: pd.DataFrame = None
) -> tuple[pd.DataFrame, pd.DataFrame]:
raw_places_df = pd.concat(
[
Expand All @@ -310,7 +310,7 @@ def load_places(
raw_places_df.to_parquet(PUBLIC_DIR / "places_annual_without_population.parquet")

place_populations_df = place_population.get_place_population_estimates(
data_path=data_repo_path / PLACE_POPULATION_DIR if data_repo_path else None
data_path=data_repo_path / PLACE_POPULATION_DIR
)
place_populations_df = fix_nyc_boroughs_population(
place_populations_df, counties_population_df
Expand Down
7 changes: 1 addition & 6 deletions python/housing_data/county_population.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pandas as pd
import us
from housing_data.build_data_utils import impute_2025_population
from housing_data.data_loading_helpers import get_url_text
from housing_data.fips_crosswalk import load_fips_crosswalk


Expand Down Expand Up @@ -119,11 +118,7 @@ def get_county_fips_crosswalk(data_repo_path: Path) -> pd.DataFrame:


def get_county_populations_1990s(data_path: Path) -> pd.DataFrame:
table_text = get_url_text(
"https://www2.census.gov/programs-surveys/popest/tables/1990-2000/counties/totals/99c8_00.txt",
data_path,
encoding="latin_1",
)
table_text = (data_path / "99c8_00.txt").read_text(encoding="latin_1")

table_text = table_text[: table_text.index("Block 2")].strip()

Expand Down
7 changes: 0 additions & 7 deletions python/housing_data/data_loading_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,3 @@ def get_url_text(
else:
web_url = os.path.join(web_prefix, common_path)
return requests.get(web_url).text


def get_path(url: str, data_path: Optional[Path]) -> str:
    """Resolve *url* to a local cache path, or pass the URL through.

    When *data_path* is given, returns the path formed by joining the
    URL's final component (its file name) onto *data_path*, as a string.
    When *data_path* is None, returns *url* unchanged so callers can
    fall back to fetching over the network.
    """
    if data_path is None:
        return url
    return str(data_path / Path(url).name)
53 changes: 13 additions & 40 deletions python/housing_data/place_population.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,10 @@
import numpy as np
import pandas as pd
from housing_data.build_data_utils import impute_2025_population
from housing_data.data_loading_helpers import get_path, get_url_text


def _get_places_crosswalk_df(data_path: Optional[Path] = None) -> pd.DataFrame:
df = pd.read_fwf(
get_path(
"https://www2.census.gov/geo/tiger/PREVGENZ/pl/us_places.txt", data_path
)
)
def _get_places_crosswalk_df(data_path: Path) -> pd.DataFrame:
df = pd.read_fwf(data_path / "us_places.txt")

df["State Code"] = df["CENSUS"] // 10000
df["Place Code"] = df["CENSUS"] % 10000
Expand Down Expand Up @@ -79,7 +74,7 @@ def get_unincorporated_places_populations_1980() -> pd.DataFrame:
return remainder_df


def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame:
def get_place_populations_1980(data_path: Path) -> pd.DataFrame:
# Assuming this is run from `python/`
# For the header row, use the nice descriptive names that IPUMS provides rather than the code names
df = pd.read_csv("../raw_data/nhgis0015_ds104_1980_place_070.csv", header=1)
Expand Down Expand Up @@ -147,12 +142,8 @@ def get_place_populations_1980(data_path: Optional[Path]) -> pd.DataFrame:
return df


def _load_raw_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame:
tables = get_url_text(
"https://www2.census.gov/programs-surveys/popest/tables/1990-2000/"
"2000-subcounties-evaluation-estimates/sc2000f_us.txt",
data_path,
).split("\f")
def _load_raw_place_populations_1990s(data_path: Path) -> pd.DataFrame:
tables = (data_path / "sc2000f_us.txt").read_text().split("\f")

common_cols = [
"Block",
Expand Down Expand Up @@ -278,7 +269,7 @@ def remove_duplicate_cities(df: pd.DataFrame) -> pd.DataFrame:
return df[~place_state_tuples.isin(dupe_cities)]


def get_place_populations_1990s(data_path: Optional[Path]) -> pd.DataFrame:
def get_place_populations_1990s(data_path: Path) -> pd.DataFrame:
combined_df = _load_raw_place_populations_1990s(data_path)

city_rows = (
Expand Down Expand Up @@ -396,14 +387,8 @@ def _melt_df(
)


def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame:
df = pd.read_csv(
get_path(
"https://www2.census.gov/programs-surveys/popest/datasets/2000-2010/intercensal/cities/sub-est00int.csv",
data_path,
),
encoding="latin_1",
)
def get_place_populations_2000s(data_path: Path) -> pd.DataFrame:
df = pd.read_csv(data_path / "sub-est00int.csv", encoding="latin_1")
return _melt_df(
df,
years=list(range(2000, 2011)),
Expand All @@ -412,26 +397,14 @@ def get_place_populations_2000s(data_path: Optional[Path]) -> pd.DataFrame:
)


def get_place_populations_2010s(data_path: Optional[Path]) -> pd.DataFrame:
df = pd.read_csv(
get_path(
"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/SUB-EST2020_ALL.csv",
data_path,
),
encoding="latin_1",
)
def get_place_populations_2010s(data_path: Path) -> pd.DataFrame:
    """Load the 2010s sub-county population estimates and melt to long format.

    Reads the Census SUB-EST2020 file from *data_path* (latin_1-encoded,
    as the Census Bureau publishes it) and reshapes the per-year columns
    into rows for 2010 through 2020 inclusive.
    """
    csv_path = data_path / "SUB-EST2020_ALL.csv"
    estimates_df = pd.read_csv(csv_path, encoding="latin_1")
    return _melt_df(estimates_df, years=[year for year in range(2010, 2021)])


def get_place_populations_2020s(data_path: Optional[Path]) -> pd.DataFrame:
df = pd.read_csv(
get_path(
"https://www2.census.gov/programs-surveys/popest/datasets/2010-2020/cities/sub-est2024.csv",
data_path,
),
encoding="latin_1",
)
def get_place_populations_2020s(data_path: Path) -> pd.DataFrame:
    """Load the 2020s sub-county population estimates and melt to long format.

    Reads the Census sub-est2024 file from *data_path* (latin_1-encoded),
    reshapes the per-year columns into rows for 2020 through 2024, then
    imputes a 2025 value via `impute_2025_population`.
    """
    raw_df = pd.read_csv(data_path / "sub-est2024.csv", encoding="latin_1")
    melted_df = _melt_df(raw_df, years=[year for year in range(2020, 2025)])
    return impute_2025_population(melted_df)
Expand Down Expand Up @@ -482,7 +455,7 @@ def interpolate_1980s_populations(
return interp_df


def get_place_population_estimates(data_path: Optional[Path] = None) -> pd.DataFrame:
def get_place_population_estimates(data_path: Path) -> pd.DataFrame:
"""
Returns a DataFrame with the columns:
- state_code (int)
Expand Down
18 changes: 3 additions & 15 deletions python/housing_data/state_population.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pandas as pd
import us
from housing_data.build_data_utils import impute_2025_population
from housing_data.data_loading_helpers import get_path, get_url_text

DIVISIONS = {
"New England": [
Expand Down Expand Up @@ -75,10 +74,7 @@ def _line_to_cols(row: str) -> list[str]:


def get_state_populations_1980s(data_path: Path) -> pd.DataFrame:
states_80s_text = get_url_text(
"https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st8090ts.txt",
data_path,
)
states_80s_text = (data_path / "st8090ts.txt").read_text()
handle = StringIO(states_80s_text)

for _ in range(10):
Expand Down Expand Up @@ -168,10 +164,7 @@ def get_state_populations_1990s(data_path: Path) -> pd.DataFrame:

def get_state_populations_2000s(data_path: Path) -> pd.DataFrame:
df = pd.read_excel(
get_path(
"https://www2.census.gov/programs-surveys/popest/tables/2000-2010/intercensal/state/st-est00int-01.xls",
data_path,
),
data_path / "st-est00int-01.xls",
skiprows=3,
skipfooter=8,
)
Expand Down Expand Up @@ -206,12 +199,7 @@ def _melt_df(df: pd.DataFrame, years: list[int]) -> pd.DataFrame:


def get_state_populations_2010s(data_path: Path) -> pd.DataFrame:
    """Load the 2010s state population estimates and melt to long format.

    Reads the Census nst-est2020-alldata file from *data_path* and
    reshapes the per-year columns into rows for 2010 through 2019.
    (2020 onward is covered by the later-vintage dataset.)
    """
    estimates_df = pd.read_csv(data_path / "nst-est2020-alldata.csv")
    return _melt_df(estimates_df, [year for year in range(2010, 2020)])

Expand Down
Loading