Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions uk_bin_collection/tests/input.json
Original file line number Diff line number Diff line change
Expand Up @@ -1344,14 +1344,11 @@
"LAD24CD": "E07000121"
},
"LeedsCityCouncil": {
"house_number": "1",
"postcode": "LS6 2SE",
"skip_get_url": true,
"uprn": "72506983",
"url": "https://www.leeds.gov.uk/residents/bins-and-recycling/check-your-bin-day",
"web_driver": "http://selenium:4444",
"wiki_name": "Leeds",
"wiki_note": "Pass the house number, postcode, and UPRN. This parser requires a Selenium webdriver.",
"wiki_note": "Pass the UPRN.",
"LAD24CD": "E08000035"
},
"LeicesterCityCouncil": {
Expand Down Expand Up @@ -1420,6 +1417,14 @@
"wiki_note": "Pass the UPRN. You can find it using [FindMyAddress](https://www.findmyaddress.co.uk/search).",
"LAD24CD": "E09000009"
},
"LondonBoroughHammersmithandFulham": {
"postcode": "W12 0BQ",
"url": "https://www.lbhf.gov.uk/",
"wiki_command_url_override": "https://www.lbhf.gov.uk/",
"wiki_name": "Hammersmith & Fulham",
"wiki_note": "Pass only the property postcode",
"LAD24CD": "E09000013"
},
Comment on lines +1420 to +1427
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Add skip_get_url: true to avoid a prefetch that can fail before parsing.
The parser performs its own request; without skip_get_url, a failed prefetch to the base URL can block the flow or add unnecessary latency. Consider adding skip_get_url: true so parse_data runs directly (the auto-generated wiki entry will then include -s).

🛠️ Suggested change
 "LondonBoroughHammersmithandFulham": {
     "postcode": "W12 0BQ",
+    "skip_get_url": true,
     "url": "https://www.lbhf.gov.uk/",
     "wiki_command_url_override": "https://www.lbhf.gov.uk/",
     "wiki_name": "Hammersmith & Fulham",
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
"LondonBoroughHammersmithandFulham": {
"postcode": "W12 0BQ",
"url": "https://www.lbhf.gov.uk/",
"wiki_command_url_override": "https://www.lbhf.gov.uk/",
"wiki_name": "Hammersmith & Fulham",
"wiki_note": "Pass only the property postcode",
"LAD24CD": "E09000013"
},
"LondonBoroughHammersmithandFulham": {
"postcode": "W12 0BQ",
"skip_get_url": true,
"url": "https://www.lbhf.gov.uk/",
"wiki_command_url_override": "https://www.lbhf.gov.uk/",
"wiki_name": "Hammersmith & Fulham",
"wiki_note": "Pass only the property postcode",
"LAD24CD": "E09000013"
},
🤖 Prompt for AI Agents
In `@uk_bin_collection/tests/input.json` around lines 1411 - 1418, The entry for
"LondonBoroughHammersmithandFulham" is missing skip_get_url and may trigger a
prefetch failure; add "skip_get_url": true to that object so the parser's
parse_data step runs directly and avoids the automatic GET to the base URL (this
will also cause the auto-generated wiki entry to include -s). Update the JSON
object for LondonBoroughHammersmithandFulham to include the skip_get_url key
alongside postcode, url, wiki_command_url_override, wiki_name, wiki_note, and
LAD24CD.

"LondonBoroughHarrow": {
"uprn": "100021298754",
"url": "https://www.harrow.gov.uk",
Expand Down
20 changes: 2 additions & 18 deletions uk_bin_collection/uk_bin_collection/councils/BarkingDagenham.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,23 +47,7 @@ def parse_data(self, page: str, **kwargs) -> dict:
# Close popup if it exists
driver.switch_to.active_element.send_keys(Keys.ESCAPE)

# Handle cookie banner if present
wait = WebDriverWait(driver, 60)
try:
cookie_button = wait.until(
EC.element_to_be_clickable(
(
By.CSS_SELECTOR,
".agree-button.eu-cookie-compliance-secondary-button.button.button--small",
)
),
message="Cookie banner not found",
)
cookie_button.click()
print("Cookie banner clicked.")
time.sleep(1) # Brief pause to let banner disappear
except (TimeoutException, NoSuchElementException):
print("No cookie banner appeared or selector failed.")
wait = WebDriverWait(driver, 10)

# Enter postcode
print("Looking for postcode input...")
Expand All @@ -84,7 +68,7 @@ def parse_data(self, page: str, **kwargs) -> dict:
EC.element_to_be_clickable((By.ID, "address")),
message="Address dropdown not found",
)

dropdown = Select(address_select)

found = False
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ def parse_data(self, page: str, **kwargs) -> dict:
data = {"bins": []}

# Get our initial session running
driver = create_webdriver(web_driver, headless, None, __name__)
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
driver = create_webdriver(web_driver, headless, user_agent, __name__)
Comment on lines +33 to +34
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Check all hardcoded user_agent strings in council scrapers
rg -n 'user_agent\s*=' --type=py uk_bin_collection/uk_bin_collection/councils/ -A 1 | head -50

Repository: robbrad/UKBinCollectionData

Length of output: 5269


🏁 Script executed:

#!/bin/bash
# Check what Chrome versions are used in the codebase
rg 'Chrome/[0-9]+' --type=py uk_bin_collection/uk_bin_collection/councils/ | sort | uniq -c

Repository: robbrad/UKBinCollectionData

Length of output: 13582


🏁 Script executed:

#!/bin/bash
# Check if there are any shared constants or utilities for user_agent
rg -n 'user_agent|USER_AGENT' --type=py uk_bin_collection/uk_bin_collection/ | grep -E '(const|CONSTANT|=)' | head -20

Repository: robbrad/UKBinCollectionData

Length of output: 3004


The hardcoded user-agent references Chrome 120, which is outdated (14 months old, with Chrome 143+ in active use elsewhere in the codebase).

Chrome 120 was released in December 2023. Multiple other council scrapers in the codebase have since moved to Chrome 121–143. Consider updating to a more current version (e.g., Chrome 141–143) to stay aligned with contemporary releases, or extract this into a shared constant similar to the pattern in get_bin_data.py.

Example UA bump
-            user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
driver = create_webdriver(web_driver, headless, user_agent, __name__)
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
driver = create_webdriver(web_driver, headless, user_agent, __name__)
🧰 Tools
🪛 Ruff (0.15.0)

[error] 34-34: create_webdriver may be undefined, or defined from star imports

(F405)

🤖 Prompt for AI Agents
In `@uk_bin_collection/uk_bin_collection/councils/BromleyBoroughCouncil.py` around
lines 33 - 34, The hardcoded user_agent string assigned to user_agent before
calling create_webdriver is stuck at Chrome 120; update it to a current Chrome
version (e.g., Chrome/143) or, better, replace the literal with the shared
constant/pattern used elsewhere (extract the UA constant used in
get_bin_data.py and import it here) so BromleyBoroughCouncil.py sets user_agent
consistently before calling create_webdriver(web_driver, headless, user_agent,
__name__). Ensure the new value follows the same format as other scrapers, and
update any imports if you extract the constant.

driver.get(kwargs.get("url"))

wait = WebDriverWait(driver, 30)
Expand Down
74 changes: 19 additions & 55 deletions uk_bin_collection/uk_bin_collection/councils/CumberlandCouncil.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from datetime import date

import requests
from bs4 import BeautifulSoup

Expand Down Expand Up @@ -29,64 +31,26 @@ def parse_data(self, page: str, **kwargs) -> dict:
if not content_region:
return bindata

# Parse the text content to extract collection dates
text_content = content_region.get_text()
lines = [line.strip() for line in text_content.split('\n') if line.strip()]

current_month = None
current_year = None
i = 0

# Determine the year range from the page header
year_2026 = "2026" in text_content

while i < len(lines):
line = lines[i]

# Check if this is a month name
if line in ["January", "February", "March", "April", "May", "June",
"July", "August", "September", "October", "November", "December"]:
current_month = line
# Determine year based on month and context
if year_2026:
current_year = "2026" if line in ["January", "February"] else "2025"
else:
current_year = str(datetime.now().year)
i += 1
continue

# Check if this is a day number (1-31)
if line.isdigit() and 1 <= int(line) <= 31 and current_month:
day = line
# Next line should be the bin type
if i + 1 < len(lines):
bin_type = lines[i + 1]

# Skip the subtype line (Refuse/Recycling detail)
if i + 2 < len(lines) and lines[i + 2] in ["Refuse", "Recycling"]:
i += 1

# Parse the date
try:
date_str = f"{day} {current_month} {current_year}"
collection_date = datetime.strptime(date_str, "%d %B %Y")

dict_data = {
"type": bin_type,
"collectionDate": collection_date.strftime(date_format),
}
bindata["bins"].append(dict_data)
except ValueError:
pass

i += 2
continue

i += 1
lis = content_region.find_all("li")
for li in lis:
collection_day = li.find("span", class_="waste-collection__day--day")
collection_type_str = li.find("span", class_="waste-collection__day--type")

collection_date = collection_day.find("time")["datetime"]

collection_type = collection_type_str.text

collection_date = datetime.strptime(collection_date, "%Y-%m-%d")

dict_data = {
"type": collection_type.strip(),
"collectionDate": collection_date.strftime(date_format),
}
bindata["bins"].append(dict_data)

# Sort by collection date
bindata["bins"].sort(
key=lambda x: datetime.strptime(x.get("collectionDate"), "%d/%m/%Y")
key=lambda x: datetime.strptime(x.get("collectionDate"), date_format)
)

return bindata
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def parse_data(self, page: str, **kwargs) -> dict:
headless = kwargs.get("headless")
web_driver = kwargs.get("web_driver")
url = f"https://www.eastleigh.gov.uk/waste-bins-and-recycling/collection-dates/your-waste-bin-and-recycling-collections?uprn={uprn}"
driver = create_webdriver(web_driver, headless, None, __name__)
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
driver = create_webdriver(web_driver, headless, user_agent, __name__)
driver.get(url)

wait = WebDriverWait(driver, 10)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,18 @@ def parse_data(self, page: str, **kwargs) -> dict:
check_uprn(user_uprn)
bindata = {"bins": []}

URI = "https://harborough.fccenvironment.co.uk/detail-address"
URI1 = "https://harborough.fccenvironment.co.uk/"
URI2 = "https://harborough.fccenvironment.co.uk/detail-address"

# Make the GET request
session = requests.session()
response = session.get(
URI1, verify=False
) # Initialize session state (cookies) required by URI2
response.raise_for_status() # Validate session initialization

headers = {
"Content-Type": "application/json",
"User-Agent": "Mozilla/5.0",
"Referer": "https://harborough.fccenvironment.co.uk/",
}
params = {"Uprn": user_uprn}
response = requests.post(URI, headers=headers, data=params, verify=False)
response = session.post(URI2, data=params, verify=False)

# Check for service errors
if response.status_code == 502:
Expand All @@ -40,20 +43,20 @@ def parse_data(self, page: str, **kwargs) -> dict:
f"This is a temporary issue with the council's waste collection system. "
f"Please try again later."
)

response.raise_for_status()

soup = BeautifulSoup(response.content, features="html.parser")
bin_collection = soup.find(
"div", {"class": "blocks block-your-next-scheduled-bin-collection-days"}
)

if bin_collection is None:
raise ValueError(
f"Could not find bin collection data for UPRN {user_uprn}. "
"The council website may have changed or the UPRN may be invalid."
)

lis = bin_collection.find_all("li")
for li in lis:
try:
Expand Down
121 changes: 35 additions & 86 deletions uk_bin_collection/uk_bin_collection/councils/LeedsCityCouncil.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,5 @@
import urllib.request
from datetime import datetime

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait

from uk_bin_collection.uk_bin_collection.common import *
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass

Expand All @@ -22,92 +13,50 @@ class CouncilClass(AbstractGetBinDataClass):

def parse_data(self, page: str, **kwargs) -> dict:
driver = None
data = {"bins": []}
try:
"""
Parse council provided CSVs to get the latest bin collections for address
"""

user_uprn = kwargs.get("uprn")
user_postcode = kwargs.get("postcode")
web_driver = kwargs.get("web_driver")
headless = kwargs.get("headless")
check_uprn(user_uprn)
check_postcode(user_postcode)
# Create Selenium webdriver
page = f"https://www.leeds.gov.uk/residents/bins-and-recycling/check-your-bin-day"

driver = create_webdriver(web_driver, headless, None, __name__)
driver.get(page)

wait = WebDriverWait(driver, 60)
postcode_box = wait.until(
EC.element_to_be_clickable(
(
By.XPATH,
"//input[@id='postcode']",
)
)
)
postcode_box.send_keys(user_postcode)
postcode_btn_present = wait.until(
EC.presence_of_element_located(
(
By.XPATH,
"//button[contains(text(),'Look up Address')]",
)
)
)

postcode_btn_present.send_keys(Keys.RETURN)
URI = "https://api.leeds.gov.uk/public/waste/v1/BinsDays"

dropdown_present = wait.until(
EC.presence_of_element_located(
(
By.XPATH,
'//option[contains(text(),"Select an address")]/parent::select',
)
)
)
startDate = datetime.now()
endDate = (startDate + timedelta(weeks=8)).strftime("%Y-%m-%d")
startDate = startDate.strftime("%Y-%m-%d")

params = {
"uprn": user_uprn,
"startDate": startDate,
"endDate": endDate,
}

headers = {
"ocp-apim-subscription-key": "ad8dd80444fe45fcad376f82cf9a5ab4",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36",
}

dropdown_select = Select(dropdown_present)
# print(params)

dropdown_select.select_by_value(user_uprn)
# Send GET request
response = requests.get(URI, params=params, headers=headers)

result = wait.until(
EC.presence_of_element_located(
(
By.XPATH,
"//div[@class='lcc-bins']",
)
print(response.content)

collections = json.loads(response.content)

for collection in collections:

collectionDate = datetime.strptime(
collection["date"], "%Y-%m-%dT%H:%M:%S"
)
)

data = {"bins": []} # dictionary for data
soup = BeautifulSoup(
result.get_attribute("innerHTML"), features="html.parser"
)

bin_sections = soup.select("div.lcc-bin:not(.lcc-bin--calendar)")

for section in bin_sections:
h3_text = section.find("h3").get_text(strip=True)
bin_type = h3_text.split()[0] # e.g., 'Black', 'Brown', 'Green'

# Find all <li> elements inside the bin days list
date_elements = section.select("div.lcc-bin__days li")
for li in date_elements:
raw_date = li.get_text(strip=True)
if not raw_date:
continue
try:
formatted_date = datetime.strptime(
raw_date, "%A %d %b %Y"
).strftime(date_format)
data["bins"].append(
{"type": bin_type, "collectionDate": formatted_date}
)
except ValueError:
print(f"Skipping unparseable date: {raw_date}")

data["bins"].append(
{
"type": collection["type"],
"collectionDate": collectionDate.strftime(date_format),
}
)

except Exception as e:
# Here you can log the exception if needed
print(f"An error occurred: {e}")
Expand Down
Loading
Loading