# random/cc_scraper.py at main · jwseph/random · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python3
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import pandas as pd

# Institution codes queried by the scraper: WA010, WA020, ... WA300
# (multiples of ten, zero-padded to three digits; 30 codes in total).
INSTITUTION_CODES = [f"WA{number:03d}" for number in range(10, 301, 10)]

# Term code selected in the search form (Summer 2025).
TERM_CODE = "2255"

def scrape_institution(driver, inst_code, term_code=None, timeout=10):
    """Scrape the class-search result rows for one institution and term.

    Loads the class-search page for ``inst_code``, selects the requested
    term in the term dropdown, submits the search and reads the result
    table.

    Args:
        driver: Selenium WebDriver used to load and drive the page.
        inst_code: Institution code; sent as the ``institution`` query
            parameter and copied into every returned record.
        term_code: ``value`` of the term ``<option>`` to select. Defaults
            to the module-level ``TERM_CODE``.
        timeout: Maximum number of seconds to wait for the term dropdown,
            and again for the result rows, to appear.

    Returns:
        A list of dicts with the keys ``"Institution"``, ``"CRN"``,
        ``"Subject"``, ``"Course"`` and ``"Title"``; the last four are the
        stripped text of the first four cells of each result row. The list
        is empty when the term is not offered in the dropdown or when no
        result rows appear within ``timeout``.

    Raises:
        selenium.common.exceptions.TimeoutException: The term dropdown did
            not appear within ``timeout`` seconds.
        selenium.common.exceptions.NoSuchElementException: The search
            button is missing from the page.
    """
    # Function-scope imports keep this block independent of the file's
    # top-level import section.
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    if term_code is None:
        term_code = TERM_CODE

    url = (
        "https://csprd.ctclink.us/psp/csprd/EMPLOYEE/SA/s/"
        "WEBLIB_HCX_CM.H_CLASS_SEARCH.FieldFormula.IScript_Main"
        f"?institution={inst_code}"
    )
    driver.get(url)
    wait = WebDriverWait(driver, timeout)

    # Wait for the term dropdown instead of sleeping a fixed interval: this
    # returns as soon as the element exists and tolerates slow page loads.
    term_sel = wait.until(
        EC.presence_of_element_located((By.NAME, "CLASS_SRCH_WRK2_STRM$0"))
    )
    term_option = next(
        (
            opt
            for opt in term_sel.find_elements(By.TAG_NAME, "option")
            if opt.get_attribute("value") == term_code
        ),
        None,
    )
    if term_option is None:
        # Submitting the search without the requested term selected would
        # return rows for whichever term happens to be pre-selected, and
        # they would be recorded as if they belonged to ``term_code``.
        print(f"{inst_code}: term {term_code} is not in the term dropdown; skipping")
        return []
    term_option.click()

    # Submit search
    driver.find_element(By.NAME, "CLASS_SRCH_WRK2_SSR_PB_GO").click()

    # Wait until at least one row of the results table is present.
    # NOTE(review): assumes the table is rendered in one step once the
    # search returns -- confirm against the live page.
    try:
        rows = wait.until(
            EC.presence_of_all_elements_located(
                (
                    By.XPATH,
                    "//div[@id='win0divAGE_WRK_CLASS_SRCH_RSLT']//table//tr",
                )
            )
        )
    except TimeoutException:
        # No rows showed up in time: treat it as an empty result set.
        return []

    courses = []
    # The first matched row is skipped (treated as the header row).
    for row in rows[1:]:
        cells = row.find_elements(By.TAG_NAME, "td")
        if len(cells) < 4:
            # Not a data row (too few cells to hold CRN/subject/course/title).
            continue
        crn, subject, course, title = (cell.text.strip() for cell in cells[:4])
        courses.append({
            "Institution": inst_code,
            "CRN": crn,
            "Subject": subject,
            "Course": course,
            "Title": title,
            # add more fields as desired...
        })
    return courses

def main():
    """Scrape every institution in ``INSTITUTION_CODES`` and save one CSV.

    Starts a headless Chrome session, calls ``scrape_institution`` for each
    code, and writes the combined records to
    ``summer2025_wa_cc_courses.csv``. An institution whose scrape raises a
    Selenium ``WebDriverException`` is reported and skipped so that the
    records already collected are still saved. The browser is always shut
    down, even when an unexpected error interrupts the run.
    """
    # Function-scope import keeps this block independent of the file's
    # top-level import section.
    from selenium.common.exceptions import WebDriverException

    chrome_opts = Options()
    chrome_opts.add_argument("--headless=new")  # run Chrome without a window
    driver = webdriver.Chrome(options=chrome_opts)

    all_courses = []
    failed_codes = []
    try:
        for code in INSTITUTION_CODES:
            print(f"Scraping {code}…")
            try:
                all_courses.extend(scrape_institution(driver, code))
            except WebDriverException as exc:
                # One broken page must not discard the other institutions.
                failed_codes.append(code)
                print(f"Failed {code}: {type(exc).__name__}: {exc.msg}")
    finally:
        # Always release the browser process, including on errors.
        driver.quit()

    if failed_codes:
        print(f"Institutions that failed: {', '.join(failed_codes)}")

    # Save results
    df = pd.DataFrame(all_courses)
    df.to_csv("summer2025_wa_cc_courses.csv", index=False)
    print("Done: summer2025_wa_cc_courses.csv")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()