Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,100 +1,167 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# uk_bin_collection/uk_bin_collection/councils/richmond_gov_uk.py

import re
import html as html_unescape
from datetime import datetime
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from uk_bin_collection.uk_bin_collection.common import *
from urllib.parse import urlparse, parse_qs

import requests

from uk_bin_collection.uk_bin_collection.common import date_format
from uk_bin_collection.uk_bin_collection.get_bin_data import AbstractGetBinDataClass


class CouncilClass(AbstractGetBinDataClass):
"""
Richmond upon Thames – parse the static My Property page.
No Selenium. No BeautifulSoup. Just requests + regex tailored to the current markup.
"""

def parse_data(self, page: str, **kwargs) -> dict:
    """
    Drive the council website with Selenium and scrape the collection table.

    kwargs:
        url        -- page to load (overrides *page*)
        paon       -- street name typed into the search box
        web_driver -- remote webdriver URL (optional)
        headless   -- run the browser headless (optional)

    Returns {"bins": [{"type": ..., "collectionDate": ...}]}.
    Re-raises any Selenium/scraping error after printing it.
    """
    # Fixed: original had a stray 'f' inside the f-string ("Arguments are f{...}")
    print(f"Arguments are {kwargs}")
    driver = None
    try:
        page = kwargs["url"]
        street_name = kwargs.get("paon")
        web_driver = kwargs.get("web_driver")
        headless = kwargs.get("headless")

        driver = create_webdriver(web_driver, headless, None, __name__)
        driver.get(page)

        # Generous timeout: the council site can be slow to render.
        wait = WebDriverWait(driver, 60)

        self.dismiss_cookie_banner(wait)
        self.input_street_name(street_name, wait)
        self.submit(wait)
        bin_types, collection_days = self.get_bins(driver)
        bindata = self.get_collection_days(bin_types, collection_days)

        print(bindata)

    except Exception as e:
        # Log and re-raise so callers see the real failure cause.
        print(f"An error occurred: {e}")
        raise
    finally:
        # Ensure the browser is closed regardless of success or failure.
        if driver:
            driver.quit()
    return bindata
base_url = kwargs.get("url") or page
pid_arg = kwargs.get("pid")
paon = kwargs.get("paon")

def get_collection_days(self, bin_types, collection_days):
    """
    Pair each header cell (bin type) with its collection-day cell and
    normalise the dates into the project-wide date_format.

    bin_types / collection_days -- parallel Selenium element lists from get_bins().
    Returns {"bins": [{"type": ..., "collectionDate": ...}]}.
    """
    bindata = {"bins": []}
    WEEKLY_COLLECTION = 0
    GARDEN_COLLECTION = 1

    for index, bin_type in enumerate(bin_types):
        # currently only handles weekly and garden collections; special
        # collections such as Christmas Day still need to be added
        if index == WEEKLY_COLLECTION:
            # the weekly cell holds a weekday name, e.g. "Thursday"
            next_collection_date = get_next_day_of_week(
                collection_days[index].text.strip(), date_format
            )
        elif index == GARDEN_COLLECTION:
            # the garden cell may end with a "More dates" link; keep only the first date
            split_date_part = collection_days[index].text.split("More dates")[0]
            next_collection_date = datetime.strptime(
                split_date_part.strip(), "%d %B %Y"
            ).strftime(date_format)
        else:
            next_collection_date = datetime.strptime(
                collection_days[index].text.strip(), "%d %B %Y"
            ).strftime(date_format)

        dict_data = {
            "type": bin_type.text.strip(),
            "collectionDate": next_collection_date,
        }
        bindata["bins"].append(dict_data)

    # NOTE(review): the pasted diff lost the trailing return; parse_data assigns
    # this method's result, so the dict must be returned.
    return bindata
if "pid=" in (base_url or ""):
target_url = base_url
elif pid_arg or pid_from_paon:
pid = pid_arg or pid_from_paon
sep = "&" if "?" in (base_url or "") else "?"
target_url = f"{base_url}{sep}pid={pid}"
Comment on lines +25 to +34
Copy link

Copilot AI Feb 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pid_from_url is computed but never used, and the presence check uses a raw substring ("pid=" in base_url). This can lead to incorrect behavior (e.g., different casing, URL-encoded query, or pid present without pid=) and is also dead code. Use _pid_from_url(base_url) as the source of truth (and remove the unused variable), e.g. treat pid_from_url as satisfying the “already has pid” case.

Copilot uses AI. Check for mistakes.
else:
raise ValueError(
"Richmond: supply a URL that already has ?pid=... OR put PID in the House Number field."
)
Comment on lines +21 to +38
Copy link

Copilot AI Feb 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

parse_data looks for kwargs["pid"], but the CLI/test harness only passes uprn (-u) and paon (-n) (see uk_bin_collection/uk_bin_collection/collect_data.py:104-107 and uk_bin_collection/tests/step_defs/test_validate_council.py:75-98). As a result, this council can’t be configured with UPRN via the existing uprn field and the current integration-test metadata for Richmond (house_number: "March Road") will now error out. Consider reading the identifier from kwargs.get("uprn") (and/or continuing to accept the old paon behavior), and update the error message to reference the actual supported inputs.

Copilot uses AI. Check for mistakes.

html = self._fetch_html(target_url)
bindata = self._parse_html_for_waste(html)
if not bindata["bins"]:
raise RuntimeError("Richmond: no bins found in page HTML.")
return bindata
Comment on lines 20 to 44
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

# First, let's examine the complete file to understand the class structure
fd -n "LondonBoroughOfRichmondUponThames.py" -x cat -n {}

Repository: robbrad/UKBinCollectionData

Length of output: 301


🏁 Script executed:

# Find the base class to understand get_and_parse_data
fd -n "\.py" -path "*/uk_bin_collection/*" | grep -i "base\|council" | head -20

Repository: robbrad/UKBinCollectionData

Length of output: 301


🏁 Script executed:

# Search for how skip_get_url is used across the codebase
rg -n "skip_get_url" --type py -B 2 -A 2

Repository: robbrad/UKBinCollectionData

Length of output: 15505


🏁 Script executed:

# Let's examine the Richmond Upon Thames file directly
cat uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py

Repository: robbrad/UKBinCollectionData

Length of output: 6122


🏁 Script executed:

# Let's look at the base class get_and_parse_data method to understand the flow
cat uk_bin_collection/uk_bin_collection/get_bin_data.py | head -120

Repository: robbrad/UKBinCollectionData

Length of output: 3737


🏁 Script executed:

# Check if pid_from_url is used anywhere after line 26 in the Richmond file
rg "pid_from_url" uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py

Repository: robbrad/UKBinCollectionData

Length of output: 182


🏁 Script executed:

# Check if LondonBoroughOfRichmondUponThames.py exists as a separate file
fd "LondonBoroughOfRichmondUponThames" --type f

Repository: robbrad/UKBinCollectionData

Length of output: 152


🏁 Script executed:

# Also search for any Council files related to Richmond
fd "Richmond\|richmond" --type f uk_bin_collection/uk_bin_collection/councils/

Repository: robbrad/UKBinCollectionData

Length of output: 53


🏁 Script executed:

# Now let's check the actual LondonBoroughOfRichmondUponThames.py file
cat uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py

Repository: robbrad/UKBinCollectionData

Length of output: 6122


🏁 Script executed:

# Verify the content matches what was shown in the review comment
cat uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py | head -50

Repository: robbrad/UKBinCollectionData

Length of output: 1757


🏁 Script executed:

# Check what's actually in the LondonBoroughOfRichmondUponThames.py file
file uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py

Repository: robbrad/UKBinCollectionData

Length of output: 115


🏁 Script executed:

# Try to see if it's a symlink or something
ls -la uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py

Repository: robbrad/UKBinCollectionData

Length of output: 201


🏁 Script executed:

# Let's check the first few lines of both files to understand
echo "=== LondonBoroughOfRichmondUponThames.py ===" && head -5 uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py && echo "" && echo "=== richmond_gov_uk.py ===" && head -5 uk_bin_collection/uk_bin_collection/councils/richmond_gov_uk.py

Repository: robbrad/UKBinCollectionData

Length of output: 419


🏁 Script executed:

# Let's get the full content of the actual file
cat uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py

Repository: robbrad/UKBinCollectionData

Length of output: 6122


Bug: pid_from_url is computed but never used; double HTTP request.

Two issues in parse_data:

  1. pid_from_url (line 26) is assigned but never referenced — it looks like line 31 was intended to include it in the fallback chain (e.g., pid_arg or pid_from_url or pid_from_paon), but it was accidentally omitted. As-is, the only check for an existing PID in the URL is the fragile string test "pid=" in base_url at line 29. Since _pid_from_url properly parses the URL parameter, it should be used.

  2. Double HTTP request — the base class get_and_parse_data calls self.get_data(address_url) (which performs a GET), then passes the result as page to parse_data. This code ignores page and makes a second request via _fetch_html(target_url) at line 40. Either use the already-fetched page content or set skip_get_url in the kwargs to prevent the base class from making the first request.

Proposed fix
 def parse_data(self, page: str, **kwargs) -> dict:
     base_url = kwargs.get("url") or page
     pid_arg = kwargs.get("pid")
     paon = kwargs.get("paon")

     # work out final URL, but DO NOT add `#my_waste`
     pid_from_url = self._pid_from_url(base_url)
     pid_from_paon = self._pid_from_paon(paon)

     if "pid=" in (base_url or ""):
         target_url = base_url
-    elif pid_arg or pid_from_paon:
-        pid = pid_arg or pid_from_paon
+    elif pid_arg or pid_from_url or pid_from_paon:
+        pid = pid_arg or pid_from_url or pid_from_paon
         sep = "&" if "?" in (base_url or "") else "?"
         target_url = f"{base_url}{sep}pid={pid}"
     else:
         raise ValueError(
             "Richmond: supply a URL that already has ?pid=... OR put PID in the House Number field."
         )

To avoid the double request, set skip_get_url=True in your integration config to prevent the base class from making the initial request.

🧰 Tools
🪛 Ruff (0.14.14)

[error] 26-26: Local variable pid_from_url is assigned to but never used

Remove assignment to unused variable pid_from_url

(F841)


[warning] 36-38: Avoid specifying long messages outside the exception class

(TRY003)


[warning] 43-43: Avoid specifying long messages outside the exception class

(TRY003)

🤖 Prompt for AI Agents
In
`@uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py`
around lines 20 - 44, parse_data currently computes pid_from_url but never uses
it and always issues a second HTTP GET via _fetch_html instead of using the
already-fetched page; update parse_data to use pid_arg or pid_from_url or
pid_from_paon (i.e., pid = pid_arg or pid_from_url or pid_from_paon) when
building target_url and avoid the double request by using the provided page
content when it's already HTML (use page if present/truthy) and only call
_fetch_html(target_url) when page is empty or when skip_get_url is not set; also
ensure integration can set skip_get_url=True (get_and_parse_data/get_data) if
you prefer to prevent the base-class initial GET.


def get_bins(self, driver):
    """Locate the collections table and return (bin-type header cells, day cells)."""
    table = driver.find_element(By.XPATH, ".//div[@id='maincontent']//table")
    rows = table.find_elements(by=By.TAG_NAME, value="tr")
    # First row carries the bin-type headers, second row the collection days;
    # the first two columns of each are address columns, so skip them.
    header_cells = rows[0].find_elements(by=By.TAG_NAME, value="th")
    day_cells = rows[1].find_elements(by=By.TAG_NAME, value="td")
    return header_cells[2:], day_cells[2:]

def submit(self, wait):
main_content_submit_button = wait.until(
EC.element_to_be_clickable(
(By.XPATH, ".//div[@id='maincontent']//input[@type='submit']")
# ----------------- HTTP -----------------

def _fetch_html(self, url: str) -> str:
    """
    GET *url* with a desktop browser User-Agent and return the response body.

    Raises requests.HTTPError (via raise_for_status) on a non-2xx response.
    """
    headers = {
        # Some council pages serve different/blocked content to default
        # python-requests UA strings, so present a real browser UA.
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
    }
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    return resp.text

# ----------------- parsing (regex) -----------------

def _parse_html_for_waste(self, html: str) -> dict:
    """
    Regex-parse the "my waste" section of the My Property page.

    Each bin is an <h4> heading followed by either a <ul> of dates or a
    single <p>; the first parseable date (or a "no collection" style
    message) becomes that bin's collectionDate.

    Returns {"bins": [{"type": ..., "collectionDate": ...}]} (possibly empty).
    """
    # isolate the waste block between <a id="my_waste"> and the next section
    waste_block = self._extract_waste_block(html)
    if not waste_block:
        return {"bins": []}

    bins = []

    # find all <h4>...</h4> in that block
    for h_match in re.finditer(r"<h4>(.*?)</h4>", waste_block, flags=re.I | re.S):
        bin_name = self._clean(h_match.group(1))
        if not bin_name:
            continue

        # slice from the end of this <h4> to either the next <h4> or end of block
        start = h_match.end()
        next_h = re.search(r"<h4>", waste_block[start:], flags=re.I)
        if next_h:
            section = waste_block[start : start + next_h.start()]
        else:
            section = waste_block[start:]

        # try to find <ul> ... <li>...</li> ... </ul>
        date_lines = []
        ul_match = re.search(r"<ul[^>]*>(.*?)</ul>", section, flags=re.I | re.S)
        if ul_match:
            ul_inner = ul_match.group(1)
            for li in re.findall(r"<li[^>]*>(.*?)</li>", ul_inner, flags=re.I | re.S):
                text = self._clean(li)
                if text:
                    date_lines.append(text)

        # fallback to <p>...</p>
        if not date_lines:
            p_match = re.search(r"<p[^>]*>(.*?)</p>", section, flags=re.I | re.S)
            if p_match:
                text = self._clean(p_match.group(1))
                if text:
                    date_lines.append(text)

        col_date = self._first_date_or_message(date_lines)
        if col_date:
            bins.append(
                {
                    "type": bin_name,
                    "collectionDate": col_date,
                }
            )

    return {"bins": bins}

def _extract_waste_block(self, html: str) -> str | None:
# try to grab from <a id="my_waste"> to <a id="my_parking"> (or my-councillors as fallback)
m = re.search(
r'<a\s+id=["\']my_waste["\']\s*></a>(.+?)(?:<a\s+id=["\']my_parking["\']|<a\s+id=["\']my_councillors["\'])',
html,
flags=re.I | re.S,
)
input_element_postcodesearch.send_keys(street_name)
if not m:
return None
return m.group(1)

# ----------------- small helpers -----------------

def _pid_from_url(self, url: str | None) -> str | None:
if not url:
return None
try:
q = parse_qs(urlparse(url).query)
return q.get("pid", [None])[0]
except Exception:
return None

def dismiss_cookie_banner(self, wait):
cookie_banner = wait.until(
EC.visibility_of_element_located((By.ID, "ccc-dismiss-button"))
def _pid_from_paon(self, paon) -> str | None:
# allow putting PID into "house number"
if paon and str(paon).isdigit() and 10 <= len(str(paon)) <= 14:
return str(paon)
Comment on lines +137 to +138
Copy link

Copilot AI Feb 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_pid_from_paon only accepts all-digit values with length 10–14. UPRNs are not guaranteed to be at least 10 digits (and may include leading zeros), so this can reject valid identifiers and force users into the unsupported pid kwarg. Consider accepting any non-empty digit string (possibly after stripping whitespace) or preferentially using kwargs.get("uprn") for UPRN while keeping this heuristic only as a fallback.

Suggested change
if paon and str(paon).isdigit() and 10 <= len(str(paon)) <= 14:
return str(paon)
# accept any non-empty, digit-only string (after stripping whitespace),
# to support UPRNs/PIDs that may be shorter than 10 digits or include leading zeros
if paon is None:
return None
paon_str = str(paon).strip()
if paon_str and paon_str.isdigit():
return paon_str

Copilot uses AI. Check for mistakes.
return None

def _clean(self, s: str) -> str:
# remove tags, unescape, strip
# first remove <br> and friends by replacing with space
s = re.sub(r"<br\s*/?>", " ", s, flags=re.I)
# strip any other simple tags
s = re.sub(r"<[^>]+>", "", s)
s = html_unescape.unescape(s)
return " ".join(s.split())

def _first_date_or_message(self, lines) -> str | None:
# match "Thursday 23 October 2025" or "23 October 2025"
date_rx = re.compile(
r"(?:(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s+)?"
r"(\d{1,2}\s+[A-Za-z]+\s+\d{4})"
)
cookie_banner.send_keys(Keys.ENTER)
for line in lines:
m = date_rx.search(line)
if m:
ds = m.group(0)
fmt = "%A %d %B %Y" if m.group(1) else "%d %B %Y"
dt = datetime.strptime(ds, fmt)
return dt.strftime(date_format)

lower = line.lower()
if "no collection" in lower or "no contract" in lower or "no subscription" in lower:
return line
Comment on lines +150 to +166
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

datetime.strptime can raise ValueError on regex-matched but unparseable strings.

The regex \d{1,2}\s+[A-Za-z]+\s+\d{4} is intentionally loose — it will match text like "23 Octobar 2025" or "1 TBC 2025", which then causes datetime.strptime at line 161 to raise an unhandled ValueError, crashing the entire parse.

Wrap the strptime call in a try/except ValueError so parsing can continue to the next line or bin.

Proposed fix
         for line in lines:
             m = date_rx.search(line)
             if m:
                 ds = m.group(0)
                 fmt = "%A %d %B %Y" if m.group(1) else "%d %B %Y"
-                dt = datetime.strptime(ds, fmt)
-                return dt.strftime(date_format)
+                try:
+                    dt = datetime.strptime(ds, fmt)
+                    return dt.strftime(date_format)
+                except ValueError:
+                    continue  # regex matched but date was not valid

Based on learnings, functions should prefer explicit failures with clear error causes over silently swallowed errors — here a continue with an optional log warning strikes the right balance between not crashing and not silently hiding the issue.

🤖 Prompt for AI Agents
In
`@uk_bin_collection/uk_bin_collection/councils/LondonBoroughOfRichmondUponThames.py`
around lines 150 - 166, In _first_date_or_message, the regex date_rx can match
strings that datetime.strptime cannot parse, so wrap the datetime.strptime(ds,
fmt) call in a try/except ValueError block: catch ValueError, optionally log a
warning that ds failed to parse (include ds and fmt) and then continue to the
next line rather than letting the exception propagate; keep returning
dt.strftime(date_format) when parsing succeeds.

return None
Loading