-
Notifications
You must be signed in to change notification settings - Fork 199
Fix Richmond Upon Thames Council #1849
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -1,100 +1,167 @@ | ||||||||||||||||||||
| from selenium.webdriver.common.by import By | ||||||||||||||||||||
| from selenium.webdriver.support import expected_conditions as EC | ||||||||||||||||||||
| # uk_bin_collection/uk_bin_collection/councils/richmond_gov_uk.py | ||||||||||||||||||||
|
|
||||||||||||||||||||
| import re | ||||||||||||||||||||
| import html as html_unescape | ||||||||||||||||||||
| from datetime import datetime | ||||||||||||||||||||
| from selenium.webdriver.support.wait import WebDriverWait | ||||||||||||||||||||
| from selenium.webdriver.common.keys import Keys | ||||||||||||||||||||
| from uk_bin_collection.uk_bin_collection.common import * | ||||||||||||||||||||
| from urllib.parse import urlparse, parse_qs | ||||||||||||||||||||
|
|
||||||||||||||||||||
| import requests | ||||||||||||||||||||
|
|
||||||||||||||||||||
| from uk_bin_collection.uk_bin_collection.common import date_format | ||||||||||||||||||||
| from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass | ||||||||||||||||||||
|
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| class CouncilClass(AbstractGetBinDataClass): | ||||||||||||||||||||
| """ | ||||||||||||||||||||
| Richmond upon Thames – parse the static My Property page. | ||||||||||||||||||||
| No Selenium. No BeautifulSoup. Just requests + regex tailored to the current markup. | ||||||||||||||||||||
| """ | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def parse_data(self, page: str, **kwargs) -> dict: | ||||||||||||||||||||
| print(f"Arguments are f{kwargs}") | ||||||||||||||||||||
| driver = None | ||||||||||||||||||||
| try: | ||||||||||||||||||||
| page = kwargs["url"] | ||||||||||||||||||||
| street_name = kwargs.get("paon") | ||||||||||||||||||||
| web_driver = kwargs.get("web_driver") | ||||||||||||||||||||
| headless = kwargs.get("headless") | ||||||||||||||||||||
|
|
||||||||||||||||||||
| driver = create_webdriver(web_driver, headless, None, __name__) | ||||||||||||||||||||
| driver.get(page) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| wait = WebDriverWait(driver, 60) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| self.dismiss_cookie_banner(wait) | ||||||||||||||||||||
| self.input_street_name(street_name, wait) | ||||||||||||||||||||
| self.submit(wait) | ||||||||||||||||||||
| bin_types, collection_days = self.get_bins(driver) | ||||||||||||||||||||
| bindata = self.get_collection_days(bin_types, collection_days) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| print(bindata) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| except Exception as e: | ||||||||||||||||||||
| # Here you can log the exception if needed | ||||||||||||||||||||
| print(f"An error occurred: {e}") | ||||||||||||||||||||
| # Optionally, re-raise the exception if you want it to propagate | ||||||||||||||||||||
| raise | ||||||||||||||||||||
| finally: | ||||||||||||||||||||
| # This block ensures that the driver is closed regardless of an exception | ||||||||||||||||||||
| if driver: | ||||||||||||||||||||
| driver.quit() | ||||||||||||||||||||
| return bindata | ||||||||||||||||||||
| base_url = kwargs.get("url") or page | ||||||||||||||||||||
| pid_arg = kwargs.get("pid") | ||||||||||||||||||||
| paon = kwargs.get("paon") | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def get_collection_days(self, bin_types, collection_days): | ||||||||||||||||||||
| bindata = {"bins": []} | ||||||||||||||||||||
| WEEKLY_COLLECTION = 0 | ||||||||||||||||||||
| GARDEN_COLLECTION = 1 | ||||||||||||||||||||
| # work out final URL, but DO NOT add #my_waste | ||||||||||||||||||||
| pid_from_url = self._pid_from_url(base_url) | ||||||||||||||||||||
| pid_from_paon = self._pid_from_paon(paon) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| for index, bin_type in enumerate(bin_types): | ||||||||||||||||||||
| # currently only handled weekly and garden collection, special collections like Christmas Day need to be added | ||||||||||||||||||||
| if index == WEEKLY_COLLECTION: | ||||||||||||||||||||
| next_collection_date = get_next_day_of_week( | ||||||||||||||||||||
| collection_days[index].text.strip(), date_format | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| elif index == GARDEN_COLLECTION: | ||||||||||||||||||||
| split_date_part = collection_days[index].text.split("More dates")[0] | ||||||||||||||||||||
| next_collection_date = datetime.strptime( | ||||||||||||||||||||
| split_date_part.strip(), "%d %B %Y" | ||||||||||||||||||||
| ).strftime(date_format) | ||||||||||||||||||||
| else: | ||||||||||||||||||||
| next_collection_date = datetime.strptime( | ||||||||||||||||||||
| collection_days[index].text.strip(), "%d %B %Y" | ||||||||||||||||||||
| ).strftime(date_format) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| dict_data = { | ||||||||||||||||||||
| "type": bin_type.text.strip(), | ||||||||||||||||||||
| "collectionDate": next_collection_date, | ||||||||||||||||||||
| } | ||||||||||||||||||||
| bindata["bins"].append(dict_data) | ||||||||||||||||||||
| if "pid=" in (base_url or ""): | ||||||||||||||||||||
| target_url = base_url | ||||||||||||||||||||
| elif pid_arg or pid_from_paon: | ||||||||||||||||||||
| pid = pid_arg or pid_from_paon | ||||||||||||||||||||
| sep = "&" if "?" in (base_url or "") else "?" | ||||||||||||||||||||
| target_url = f"{base_url}{sep}pid={pid}" | ||||||||||||||||||||
| else: | ||||||||||||||||||||
| raise ValueError( | ||||||||||||||||||||
| "Richmond: supply a URL that already has ?pid=... OR put PID in the House Number field." | ||||||||||||||||||||
| ) | ||||||||||||||||||||
|
Comment on lines
+21
to
+38
|
||||||||||||||||||||
|
|
||||||||||||||||||||
| html = self._fetch_html(target_url) | ||||||||||||||||||||
| bindata = self._parse_html_for_waste(html) | ||||||||||||||||||||
| if not bindata["bins"]: | ||||||||||||||||||||
| raise RuntimeError("Richmond: no bins found in page HTML.") | ||||||||||||||||||||
| return bindata | ||||||||||||||||||||
|
Comment on lines
20
to
44
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: # First, let's examine the complete file to understand the class structure
fd -n "LondonBoroughOfRichmondUponThames.py" -x cat -n {}Repository: robbrad/UKBinCollectionData Length of output: 301 🏁 Script executed: # Find the base class to understand get_and_parse_data
fd -n "\.py" -path "*/uk_bin_collection/*" | grep -i "base\|council" | head -20Repository: robbrad/UKBinCollectionData Length of output: 301 🏁 Script executed: # Search for how skip_get_url is used across the codebase
rg -n "skip_get_url" --type py -B 2 -A 2Repository: robbrad/UKBinCollectionData Length of output: 15505 🏁 Script executed: # Let's examine the Richmond Upon Thames file directly
cat uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.pyRepository: robbrad/UKBinCollectionData Length of output: 6122 🏁 Script executed: # Let's look at the base class get_and_parse_data method to understand the flow
cat uk_bin_collection/uk_bin_collection/get_bin_data.py | head -120Repository: robbrad/UKBinCollectionData Length of output: 3737 🏁 Script executed: # Check if pid_from_url is used anywhere after line 26 in the Richmond file
rg "pid_from_url" uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.pyRepository: robbrad/UKBinCollectionData Length of output: 182 🏁 Script executed: # Check if LondonBoroughOfRichmondUponThames.py exists as a separate file
fd "LondonBoroughOfRichmondUponThames" --type fRepository: robbrad/UKBinCollectionData Length of output: 152 🏁 Script executed: # Also search for any Council files related to Richmond
fd "Richmond\|richmond" --type f uk_bin_collection/uk_bin_collection/councils/Repository: robbrad/UKBinCollectionData Length of output: 53 🏁 Script executed: # Now let's check the actual LondonBoroughOfRichmondUponThames.py file
cat uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.pyRepository: robbrad/UKBinCollectionData Length of output: 6122 🏁 Script executed: # Verify the content matches what was shown in the review comment
cat uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py | head -50Repository: robbrad/UKBinCollectionData Length of output: 1757 🏁 Script executed: # Check what's actually in the LondonBoroughOfRichmondUponThames.py file
file uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.pyRepository: robbrad/UKBinCollectionData Length of output: 115 🏁 Script executed: # Try to see if it's a symlink or something
ls -la uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.pyRepository: robbrad/UKBinCollectionData Length of output: 201 🏁 Script executed: # Let's check the first few lines of both files to understand
echo "=== LondonBoroughOfRichmondUponThames.py ===" && head -5 uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py && echo "" && echo "=== richmond_gov_uk.py ===" && head -5 uk_bin_collection/uk_bin_collection/councils/richmond_gov_uk.pyRepository: robbrad/UKBinCollectionData Length of output: 419 🏁 Script executed: # Let's get the full content of the actual file
cat uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.pyRepository: robbrad/UKBinCollectionData Length of output: 6122 Bug: Two issues in
Proposed fix def parse_data(self, page: str, **kwargs) -> dict:
base_url = kwargs.get("url") or page
pid_arg = kwargs.get("pid")
paon = kwargs.get("paon")
# work out final URL, but DO NOT add `#my_waste`
pid_from_url = self._pid_from_url(base_url)
pid_from_paon = self._pid_from_paon(paon)
if "pid=" in (base_url or ""):
target_url = base_url
- elif pid_arg or pid_from_paon:
- pid = pid_arg or pid_from_paon
+ elif pid_arg or pid_from_url or pid_from_paon:
+ pid = pid_arg or pid_from_url or pid_from_paon
sep = "&" if "?" in (base_url or "") else "?"
target_url = f"{base_url}{sep}pid={pid}"
else:
raise ValueError(
"Richmond: supply a URL that already has ?pid=... OR put PID in the House Number field."
)To avoid the double request, set 🧰 Tools🪛 Ruff (0.14.14)[error] 26-26: Local variable Remove assignment to unused variable (F841) [warning] 36-38: Avoid specifying long messages outside the exception class (TRY003) [warning] 43-43: Avoid specifying long messages outside the exception class (TRY003) 🤖 Prompt for AI Agents |
||||||||||||||||||||
|
|
||||||||||||||||||||
| def get_bins(self, driver): | ||||||||||||||||||||
| table = driver.find_element(By.XPATH, ".//div[@id='maincontent']//table") | ||||||||||||||||||||
| table_rows = table.find_elements(by=By.TAG_NAME, value="tr") | ||||||||||||||||||||
| headerRow = table_rows[0] | ||||||||||||||||||||
| table_info_row = table_rows[1] | ||||||||||||||||||||
| bin_types = headerRow.find_elements(by=By.TAG_NAME, value="th")[2:] | ||||||||||||||||||||
| collection_days = table_info_row.find_elements(by=By.TAG_NAME, value="td")[2:] | ||||||||||||||||||||
| return bin_types, collection_days | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def submit(self, wait): | ||||||||||||||||||||
| main_content_submit_button = wait.until( | ||||||||||||||||||||
| EC.element_to_be_clickable( | ||||||||||||||||||||
| (By.XPATH, ".//div[@id='maincontent']//input[@type='submit']") | ||||||||||||||||||||
| # ----------------- HTTP ----------------- | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def _fetch_html(self, url: str) -> str: | ||||||||||||||||||||
| headers = { | ||||||||||||||||||||
| "User-Agent": ( | ||||||||||||||||||||
| "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " | ||||||||||||||||||||
| "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| main_content_submit_button.send_keys(Keys.ENTER) | ||||||||||||||||||||
| } | ||||||||||||||||||||
| resp = requests.get(url, headers=headers, timeout=30) | ||||||||||||||||||||
| resp.raise_for_status() | ||||||||||||||||||||
| return resp.text | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # ----------------- parsing (regex) ----------------- | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def _parse_html_for_waste(self, html: str) -> dict: | ||||||||||||||||||||
| # isolate the waste block between <a id="my_waste"> and next section | ||||||||||||||||||||
| waste_block = self._extract_waste_block(html) | ||||||||||||||||||||
| if not waste_block: | ||||||||||||||||||||
| return {"bins": []} | ||||||||||||||||||||
|
|
||||||||||||||||||||
| bins = [] | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def input_street_name(self, street_name, wait): | ||||||||||||||||||||
| input_element_postcodesearch = wait.until( | ||||||||||||||||||||
| EC.visibility_of_element_located((By.ID, "Street")) | ||||||||||||||||||||
| # find all <h4>...</h4> in that block | ||||||||||||||||||||
| for h_match in re.finditer(r"<h4>(.*?)</h4>", waste_block, flags=re.I | re.S): | ||||||||||||||||||||
| bin_name = self._clean(h_match.group(1)) | ||||||||||||||||||||
| if not bin_name: | ||||||||||||||||||||
| continue | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # slice from end of this <h4> to either next <h4> or end of block | ||||||||||||||||||||
| start = h_match.end() | ||||||||||||||||||||
| # find next h4 after this one | ||||||||||||||||||||
| next_h = re.search(r"<h4>", waste_block[start:], flags=re.I) | ||||||||||||||||||||
| if next_h: | ||||||||||||||||||||
| section = waste_block[start : start + next_h.start()] | ||||||||||||||||||||
| else: | ||||||||||||||||||||
| section = waste_block[start:] | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # try to find <ul> ... <li>...</li> ... </ul> | ||||||||||||||||||||
| date_lines = [] | ||||||||||||||||||||
| ul_match = re.search(r"<ul[^>]*>(.*?)</ul>", section, flags=re.I | re.S) | ||||||||||||||||||||
| if ul_match: | ||||||||||||||||||||
| ul_inner = ul_match.group(1) | ||||||||||||||||||||
| for li in re.findall(r"<li[^>]*>(.*?)</li>", ul_inner, flags=re.I | re.S): | ||||||||||||||||||||
| text = self._clean(li) | ||||||||||||||||||||
| if text: | ||||||||||||||||||||
| date_lines.append(text) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # fallback to <p>...</p> | ||||||||||||||||||||
| if not date_lines: | ||||||||||||||||||||
| p_match = re.search(r"<p[^>]*>(.*?)</p>", section, flags=re.I | re.S) | ||||||||||||||||||||
| if p_match: | ||||||||||||||||||||
| text = self._clean(p_match.group(1)) | ||||||||||||||||||||
| if text: | ||||||||||||||||||||
| date_lines.append(text) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| col_date = self._first_date_or_message(date_lines) | ||||||||||||||||||||
| if col_date: | ||||||||||||||||||||
| bins.append( | ||||||||||||||||||||
| { | ||||||||||||||||||||
| "type": bin_name, | ||||||||||||||||||||
| "collectionDate": col_date, | ||||||||||||||||||||
| } | ||||||||||||||||||||
| ) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| return {"bins": bins} | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def _extract_waste_block(self, html: str) -> str | None: | ||||||||||||||||||||
| # try to grab from <a id="my_waste"> to <a id="my_parking"> (or my-councillors as fallback) | ||||||||||||||||||||
| m = re.search( | ||||||||||||||||||||
| r'<a\s+id=["\']my_waste["\']\s*></a>(.+?)(?:<a\s+id=["\']my_parking["\']|<a\s+id=["\']my_councillors["\'])', | ||||||||||||||||||||
| html, | ||||||||||||||||||||
| flags=re.I | re.S, | ||||||||||||||||||||
| ) | ||||||||||||||||||||
| input_element_postcodesearch.send_keys(street_name) | ||||||||||||||||||||
| if not m: | ||||||||||||||||||||
| return None | ||||||||||||||||||||
| return m.group(1) | ||||||||||||||||||||
|
|
||||||||||||||||||||
| # ----------------- small helpers ----------------- | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def _pid_from_url(self, url: str | None) -> str | None: | ||||||||||||||||||||
| if not url: | ||||||||||||||||||||
| return None | ||||||||||||||||||||
| try: | ||||||||||||||||||||
| q = parse_qs(urlparse(url).query) | ||||||||||||||||||||
| return q.get("pid", [None])[0] | ||||||||||||||||||||
| except Exception: | ||||||||||||||||||||
| return None | ||||||||||||||||||||
|
|
||||||||||||||||||||
| def dismiss_cookie_banner(self, wait): | ||||||||||||||||||||
| cookie_banner = wait.until( | ||||||||||||||||||||
| EC.visibility_of_element_located((By.ID, "ccc-dismiss-button")) | ||||||||||||||||||||
| def _pid_from_paon(self, paon) -> str | None: | ||||||||||||||||||||
| # allow putting PID into "house number" | ||||||||||||||||||||
| if paon and str(paon).isdigit() and 10 <= len(str(paon)) <= 14: | ||||||||||||||||||||
| return str(paon) | ||||||||||||||||||||
|
Comment on lines
+137
to
+138
|
||||||||||||||||||||
| if paon and str(paon).isdigit() and 10 <= len(str(paon)) <= 14: | |
| return str(paon) | |
| # accept any non-empty, digit-only string (after stripping whitespace), | |
| # to support UPRNs/PIDs that may be shorter than 10 digits or include leading zeros | |
| if paon is None: | |
| return None | |
| paon_str = str(paon).strip() | |
| if paon_str and paon_str.isdigit(): | |
| return paon_str |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
datetime.strptime can raise ValueError on regex-matched but unparseable strings.
The regex \d{1,2}\s+[A-Za-z]+\s+\d{4} is intentionally loose — it will match text like "23 Octobar 2025" or "1 TBC 2025", which then causes datetime.strptime at line 161 to raise an unhandled ValueError, crashing the entire parse.
Wrap the strptime call in a try/except ValueError so parsing can continue to the next line or bin.
Proposed fix
for line in lines:
m = date_rx.search(line)
if m:
ds = m.group(0)
fmt = "%A %d %B %Y" if m.group(1) else "%d %B %Y"
- dt = datetime.strptime(ds, fmt)
- return dt.strftime(date_format)
+ try:
+ dt = datetime.strptime(ds, fmt)
+ return dt.strftime(date_format)
+ except ValueError:
+ continue # regex matched but date was not validBased on learnings, functions should prefer explicit failures with clear error causes over silently swallowed errors — here a continue with an optional log warning strikes the right balance between not crashing and not silently hiding the issue.
🤖 Prompt for AI Agents
In
`@uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py`
around lines 150 - 166, In _first_date_or_message, the regex date_rx can match
strings that datetime.strptime cannot parse, so wrap the datetime.strptime(ds,
fmt) call in a try/except ValueError block: catch ValueError, optionally log a
warning that ds failed to parse (include ds and fmt) and then continue to the
next line rather than letting the exception propagate; keep returning
dt.strftime(date_format) when parsing succeeds.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
pid_from_urlis computed but never used, and the presence check uses a raw substring ("pid=" in base_url). This can lead to incorrect behavior (e.g., different casing, URL-encoded query, orpidpresent withoutpid=) and is also dead code. Use_pid_from_url(base_url)as the source of truth (and remove the unused variable), e.g. treatpid_from_urlas satisfying the “already has pid” case.