diff --git a/compass/_cli/process.py b/compass/_cli/process.py index 31712d5e..5a062c7f 100644 --- a/compass/_cli/process.py +++ b/compass/_cli/process.py @@ -2,8 +2,11 @@ import asyncio import logging +import shutil +import sys import warnings import multiprocessing +from pathlib import Path import click from rich.live import Live @@ -14,8 +17,11 @@ from compass.pb import COMPASS_PB from compass.plugin import create_schema_based_one_shot_extraction_plugin from compass.scripts.process import process_jurisdictions_with_openai -from compass.utilities.logs import AddLocationFilter from compass.utilities.io import load_config +from compass.utilities.logs import AddLocationFilter + + +OUT_DIR_POLICY_CHOICES = ["fail", "increment", "overwrite", "prompt"] @click.command @@ -49,10 +55,30 @@ default=None, help="One-shot plugin configuration to add to COMPASS before processing", ) -def process(config, verbose, no_progress, plugin): +@click.option( + "--out_dir_exists", + required=False, + default=None, + type=click.Choice(OUT_DIR_POLICY_CHOICES, case_sensitive=False), + help="How to handle an existing output directory." + " Choices: fail, increment, overwrite, prompt." + " If omitted, prompts interactively when running in a terminal," + " or fails when running non-interactively (e.g. CI).", +) +def process(config, verbose, no_progress, plugin, out_dir_exists): """Download and extract ordinances for a list of jurisdictions""" config = load_config(config) + if out_dir_exists is not None: + out_dir_policy = out_dir_exists + elif sys.stdin.isatty(): + out_dir_policy = "prompt" + else: + out_dir_policy = "fail" + config["out_dir"] = _resolve_out_dir_conflict( + config["out_dir"], out_dir_policy + ) + if plugin is not None: create_schema_based_one_shot_extraction_plugin( config=plugin, tech=config["tech"] @@ -128,3 +154,80 @@ def _setup_cli_logging(console, verbosity_level, log_level="INFO"): handler.addFilter(AddLocationFilter()) logger.addHandler(handler) logger.setLevel(log_level) + + +def _resolve_out_dir_conflict(out_dir, policy): + """Handle existing output directory using the selected policy""" + out_dir = Path(out_dir) + policy = policy.lower() + + if not out_dir.exists(): + return out_dir + + if policy == "fail": + return out_dir + + if policy == "increment": + new_out_dir = _next_versioned_directory(out_dir) + click.echo( + "Output directory exists. " + f"Using incremented directory: {new_out_dir!s}" + ) + return new_out_dir + + if policy == "overwrite": + click.echo(f"Overwriting existing output directory: {out_dir!s}") + shutil.rmtree(out_dir) + return out_dir + + if policy == "prompt": + if not sys.stdin.isatty(): + msg = ( + "Cannot use out_dir_exists='prompt' in non-interactive mode. " + "Use one of: fail, increment, overwrite." + ) + raise click.ClickException(msg) + + create_incremented = click.confirm( + f"Output directory '{out_dir!s}' already exists. " + "Create a new incremented directory automatically?", + default=True, + ) + if create_incremented: + new_out_dir = _next_versioned_directory(out_dir) + click.echo(f"Using incremented directory: {new_out_dir!s}") + return new_out_dir + + overwrite = click.confirm( + f"Overwrite '{out_dir!s}' by deleting it and continuing?", + default=False, + ) + if overwrite: + click.echo(f"Overwriting existing output directory: {out_dir!s}") + shutil.rmtree(out_dir) + return out_dir + + msg = ( + "Run cancelled. Please update out_dir in config, or rerun with " + "--out_dir_exists increment/overwrite." + ) + raise click.ClickException(msg) + + msg = ( + f"Unknown out_dir_exists policy '{policy}'. " + f"Supported values: {OUT_DIR_POLICY_CHOICES}." + ) + raise click.ClickException(msg) + + +def _next_versioned_directory(out_dir): + """ + Create the next available output directory suffix with + versioning + """ + idx = 2 + while True: + candidate = out_dir.parent / f"{out_dir.name}_v{idx}" + if not candidate.exists(): + return candidate + idx += 1 diff --git a/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml b/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml new file mode 100644 index 00000000..1cd642b1 --- /dev/null +++ b/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml @@ -0,0 +1,170 @@ +schema: ./geothermal_schema.json + +data_type_short_desc: utility-scale geothermal electricity ordinance + +cache_llm_generated_content: true + +query_templates: + - "filetype:pdf {jurisdiction} geothermal power plant ordinance" + - "geothermal electricity generation ordinance {jurisdiction}" + - "{jurisdiction} geothermal energy facility zoning ordinance" + - "{jurisdiction} geothermal power plant land use code" + - "{jurisdiction} geothermal code of ordinances" + - "{jurisdiction} geothermal conditional use permit" + - "{jurisdiction} geothermal special use permit" + - "{jurisdiction} geothermal drilling permit regulations" + - "{jurisdiction} geothermal resource development statute" + - "Where can I find the legal text for geothermal power plant zoning ordinances in {jurisdiction}?" + - "What is the specific legal information regarding zoning ordinances for geothermal electricity generation facilities in {jurisdiction}?" + +website_keywords: + pdf: 92160 + geothermal: 46080 + ordinance: 23040 + zoning: 11520 + regulation: 5760 + code: 2880 + power: 1440 + electricity: 1440 + planning: 720 + permit: 720 + land use: 720 + municipal: 720 + county: 360 + ordinance code: 360 + code of ordinances: 360 + land use code: 360 + use table: 360 + chapter: 180 + article: 180 + title: 180 + statute: 180 + administrative code: 180 + conditional use permit: 180 + special use permit: 180 + drilling permit: 180 + resource development: 180 + government: 180 + +heuristic_keywords: + good_tech_keywords: + - "wellfield" + - "well field" + - "production well" + - "geothermal exploration" + - "geothermal generating" + - "geothermal generation" + - "geothermal power" + - "geothermal production" + - "geothermal project" + - "geothermal overlay zone" + - "geothermal power plant" + - "geothermal facility" + - "geothermal electric" + - "geothermal energy facility" + - "steam turbine" + - "binary cycle" + - "flash steam" + - "dry steam" + - "enhanced geothermal" + - "reservoir temperature" + - "brine" + - "reinjection well" + - "production zone" + - "geothermal resource" + - "geothermal production project" + - "geothermal drilling" + - "exploratory well" + - "injection well" + - "geothermal lease" + - "drilling permit" + - "plan of utilization" + - "known geothermal resource" + - "geothermal development" + - "geothermal well" + - "geothermal reservoir" + - "geothermal permit" + - "geothermal ordinance" + - "geothermal zoning" + - "code of ordinances" + - "land use code" + - "use table" + - "zoning ordinance" + - "special use permit" + - "conditional use permit" + good_tech_acronyms: + - "egs" + - "kgra" + good_tech_phrases: + - "geothermal power plant" + - "geothermal electricity generation" + - "geothermal energy facility" + - "geothermal resource development" + - "known geothermal resource area" + - "binary cycle" + - "flash steam" + - "dry steam" + - "steam turbine" + - "plan of utilization" + - "land use code" + - "code of ordinances" + - "zoning ordinance" + - "special use permit" + - "conditional use permit" + not_tech_words: + - "geothermal heat pump" + - "ground source heat pump" + - "ground-source heat pump" + - "ghp" + - "ground heat pump" + - "gshp" + - "ground-coupled heat pump" + - "ground coupled heat pump" + - "earth-coupled heat pump" + - "earth-source heat pump" + - "geoexchange" + - "geo-exchange" + - "closed loop" + - "closed-loop" + - "open loop" + - "vertical loop" + - "horizontal loop" + - "heating and cooling" + - "hvac" + - "space heating" + - "water heating" + - "direct use" + - "direct-use" + - "district heating" + - "greenhouse heating" + - "residential geothermal" + - "accessory use" + - "energy star" + - "solar panel" + - "solar array" + - "solar farm" + - "solar energy system" + - "solar energy facility" + - "photovoltaic" + - "net metering" + - "solar collector" + - "solar ordinance" + - "wind energy" + - "wind farm" + - "wind turbine" + - "wind energy system" + - "wind energy facility" + - "wind energy conversion" + - "wind ordinance" + - "anemometer tower" + - "meteorological tower" + - "rotor diameter" + - "tip height" + - "nacelle" + - "battery storage" + - "energy storage system" + - "hydroelectric" + - "biomass" + - "cannabis" + - "cannabis cultivation" + - "commercial cannabis" \ No newline at end of file diff --git a/compass/extraction/geothermal_electricity/geothermal_schema.json b/compass/extraction/geothermal_electricity/geothermal_schema.json new file mode 100644 index 00000000..2b30883c --- /dev/null +++ b/compass/extraction/geothermal_electricity/geothermal_schema.json @@ -0,0 +1,383 @@ +{ + "title": "Geothermal Electricity Ordinance Extraction Schema", + "description": "Single-shot structured extraction schema for utility-scale geothermal electricity ordinances. This schema guides an LLM to extract all relevant features in one call and returns an outputs array where each object represents one row in the extracted long-form table.", + "version": "2.1.2", + "type": "object", + "required": ["outputs"], + "additionalProperties": false, + "properties": { + "outputs": { + "type": "array", + "description": "Sparse long-form extraction table. Include only features with an enacted, explicit requirement and emit at most one row per feature. Never infer, imply, or guess a requirement from related context.", + "items": { + "type": "object", + "required": [ + "feature", + "value", + "units", + "section", + "summary", + "explanation" + ], + "additionalProperties": false, + "properties": { + "feature": { + "type": "string", + "description": "The ordinance feature being extracted. Must be one of the enumerated feature IDs exactly as written. Do not invent aliases, prefixes, or synonym variants; for example, use 'residential zones distance' and never 'structures residential zones distance'.", + "enum": [ + "residential zones distance", + "property lines distance", + "roads distance", + "railroads distance", + "existing transmission lines distance", + "water bodies distance", + "combustible tanks distance", + "domestic wells distance", + "active faults distance", + "schools distance", + "hospitals distance", + "drilling start time", + "drilling end time", + "noise", + "maximum height", + "minimum lot size", + "fencing", + "color requirements", + "lighting requirements", + "visual impact assessment", + "seismic monitoring plan", + "primary use districts", + "special use districts", + "accessory use districts", + "prohibited use districts", + "required permits", + "bond requirement", + "decommissioning", + "prohibitions" + ] + }, + "value": { + "description": "The extracted ordinance value. For numerical distance thresholds and limits, use a number. For permit or district lists, use an array of strings. For drilling start and end time features, use a string in 24-hour HH:MM format. For date language or other categorical outcomes, use a string. Use null only for qualitative features, and only when an enacted, explicit, enforceable ordinance requirement for that feature is present. Null must never be used to indicate absence. If a feature has no enacted, explicit requirement in the ordinance text, omit that feature from outputs.", + "anyOf": [ + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + }, + { + "type": "null" + } + ] + }, + "units": { + "type": ["string", "null"], + "description": "Units for the extracted value. Use canonical units for interoperability. For distance and height features, use 'feet' or 'meters'. For minimum lot size, use 'acres' or 'square feet'. For noise, normalize A-weighted variants such as 'dB(A)' or 'dBA' to 'dBA'; keep plain 'dB' only when the text is explicitly not A-weighted. For drilling start and drilling end features, use 'HH:MM (24-hour)'. Preserve verbatim ordinance wording in summary while keeping units standardized in this field. Use null for district lists, permit lists, dates, and qualitative requirements without measurable units." + }, + "section": { + "type": ["string", "null"], + "description": "The section title or number where the requirement appears. Include labels or numbers if available. Null if no section identifier is available." + }, + "summary": { + "type": "string", + "description": "A short summary with direct ordinance excerpts or quotes whenever possible. For qualitative features such as permitting, fencing, lighting, seismic monitoring, decommissioning, and prohibitions, this is the primary output field and should contain direct ordinance language. For numeric features, summary must support the same requirement used to extract value and units. Must be a non-null, non-empty string. Do not output absence placeholders such as 'No explicit requirement found'; omit the feature instead when no requirement is present." + }, + "explanation": { + "type": "string", + "description": "Brief rationale explaining why this row matches the selected feature under this schema. Reference the specific evidence in summary and how it supports the extracted value and units or, for qualitative features, the inclusion criteria. Must be a non-null, non-empty string and must not use absence placeholders." + } + } + } + } + }, + "$core_principles": { + "scope_context": { + "description": "Only extract requirements that apply to utility-scale geothermal electricity generation and directly associated geothermal electricity infrastructure, including exploration or drilling operations, production or injection wells, geothermal power plants, associated substations, and gen-tie lines only when the ordinance text explicitly governs them. Exclude geothermal heat pumps, HVAC, direct-use geothermal, district heating, greenhouse heating, residential geothermal, and other non-generation systems unless the ordinance explicitly governs utility-scale electricity generation as well. State statutes and regulations are in scope when they impose enforceable siting, zoning, drilling, permitting, monitoring, bonding, setback, or decommissioning requirements on geothermal electricity projects in the jurisdiction. This schema must work across nationwide ordinance styles, including county, municipal, township, parish, borough, tribal, and state-level regulatory text." + }, + "technology_applicability_gate": { + "description": "Extract a row only when the cited evidence clearly applies to geothermal electricity. If the excerpt does not explicitly mention geothermal, the summary must include clear evidence that the governing section or table applies to geothermal electricity projects (for example a geothermal-specific chapter heading, geothermal-defined use class, or explicit cross-reference in the same provided text). If applicability is ambiguous or technology-neutral without clear geothermal linkage, omit the feature." + }, + "nationwide_jurisdiction_handling": { + "description": "Preserve the ordinance's own governance vocabulary instead of normalizing it to one jurisdiction type. Districts may appear as zoning districts, use districts, overlays, resource areas, exclusive farm use zones, planned development areas, or similar land-use categories. Permits may appear as conditional use permits, special use permits, administrative permits, use permits, site certificates, plans of operation, drilling authorizations, county approvals, state board approvals, or similar authorizations. If a jurisdiction is effectively unzoned or the controlling requirements come from state regulation without district tables, omit district features rather than forcing a district classification." + }, + "strict_evidence_gate": { + "description": "Extract a feature only when the ordinance text explicitly states a requirement, permission, district allowance, prohibition, date, or other operative rule for that same feature. Never infer, assume, extrapolate, or guess from related context, implications, headings, or nearby provisions. Tables, matrices, footnotes, appendices, and labeled exhibits count when they state the operative requirement. If the ordinance points to another chapter, outside document, or technical standard without restating the controlling requirement in the provided text, omit the feature instead of emitting a blank, placeholder, or inferred value." + }, + "data_omission": { + "description": "Emit only positively matched features. If a feature is not explicitly present, omit it entirely rather than returning placeholder text. For qualitative features, use value=null and units=null only when an enacted, explicit requirement for that same feature is present. For numeric features, extract only when an explicit numeric threshold is stated in the ordinance text; otherwise omit the feature instead of returning null, empty, or qualitative-only values. Never emit absence placeholders such as 'not found', 'no explicit requirement', 'none', or similar text in any field." + }, + "numeric_prioritization": { + "description": "When multiple numeric values apply to the same feature, keep one row and select the controlling most restrictive value for that feature. Restrictiveness rules: distance features choose the largest minimum separation distance; noise choose the lowest allowed numeric limit; maximum height choose the lowest maximum height; minimum lot size choose the highest minimum lot size. Keep condition-specific alternatives in summary only when the ordinance text explicitly shows they all apply to the same geothermal electricity feature." + }, + "district_and_permit_routing": { + "description": "Use the district features only for zoning districts, overlay districts, or land-use districts that explicitly classify geothermal electricity facilities. Use 'primary use districts' when the use is allowed by right or as a principal permitted use. Use 'special use districts' when the use requires a conditional use permit, special use permit, or comparable discretionary approval. Use 'accessory use districts' only when the ordinance explicitly allows geothermal electricity as an accessory or secondary use. Use 'prohibited use districts' only for districts where geothermal electricity facilities are explicitly and unconditionally prohibited. Use 'required permits' for explicit permit, approval, site-plan, zoning-permit, county-permit, drilling-permit, or similar authorization requirements that must be filed to initiate geothermal exploration, drilling, construction, or operation. Do not route permit requirements into district features, and do not route district tables into permit features." + }, + "prohibition_boundary": { + "description": "Use 'prohibitions' only for currently effective bans or moratoria on geothermal electricity exploration, drilling, well development, power plant siting, facility construction, or related project deployment. A ban on hydraulic fracturing or fracking counts only when the ordinance explicitly uses that ban to prohibit, limit, or condition geothermal electricity development. Do not treat ordinary permit conditions, environmental standards, or operational restrictions as prohibitions when the project remains allowable subject to compliance." + } + }, + "$definitions": { + "setback_features": { + "description": "Setback features for geothermal electricity facilities and related infrastructure. Treat each setback feature independently and do not cross-apply a setback unless the ordinance text explicitly states that it applies to multiple target types. When a single clause explicitly lists multiple target types and one shared numeric setback, emit one row per explicitly listed feature using the same numeric value and units and cite the same clause in summary. Apply the shared numeric prioritization rules in $core_principles when multiple numeric values explicitly apply to the same feature.", + "properties": { + "residential zones distance": { + "description": "Minimum required separation from structures, dwellings, occupied buildings, residences, homes, residential receptors, residential uses, or residential zoning districts. Extract this feature only when the ordinance explicitly ties the setback to structures, residences, occupied buildings, or residential zones. If one clause applies a common setback to multiple structure-like receptors such as homes, occupied buildings, and residential districts, keep one row under this feature and preserve the exact receptor list in summary. Do not map generic property line or district boundary setbacks here unless the text explicitly names structures or residential zones." + }, + "property lines distance": { + "description": "Minimum required separation from property lines, lot lines, parcel boundaries, or lease boundaries when the ordinance explicitly states the distance is measured from that boundary. Do not remap property-line distances to roads, transmission lines, or residential zones unless the text explicitly makes them equivalent for that requirement. Distances to official plan lines or specific plan lines for public highways do not belong here unless the ordinance expressly defines those lines as property boundaries for the same requirement." + }, + "roads distance": { + "description": "Minimum required separation from public roads, road rights-of-way, streets, highways, named roadway corridors, official plan lines, specific plan lines for highways, or similar transportation corridors explicitly framed as roads. Property-line distances do not count for this feature unless the ordinance text explicitly states that the property line is the road right-of-way or otherwise makes them the same boundary for that requirement." + }, + "railroads distance": { + "description": "Minimum required separation from railroads, railroad rights-of-way, rail corridors, or active rail lines. Extract only when rail infrastructure is explicitly named." + }, + "existing transmission lines distance": { + "description": "Minimum required separation from existing transmission lines, transmission corridors, substations, or other existing electric transmission infrastructure when explicitly named. Use this feature for geothermal generation or gen-tie line setbacks from existing transmission assets only when the ordinance text expressly states the setback." + }, + "water bodies distance": { + "description": "Minimum required separation from rivers, streams, lakes, ponds, wetlands, reservoirs, shorelines, floodplains, springs, or other water bodies explicitly named in the ordinance. Do not map domestic water well setbacks here." + }, + "combustible tanks distance": { + "description": "Minimum required separation from combustible tanks, fuel tanks, flammable storage tanks, petroleum tanks, or similar combustible storage infrastructure. Extract only when that storage infrastructure is explicitly named." + }, + "domestic wells distance": { + "description": "Minimum required separation from domestic wells, private wells, drinking water wells, household wells, or other non-production water supply wells explicitly identified as domestic or private. Do not map production, injection, monitoring, or geothermal wells here." + }, + "active faults distance": { + "description": "Minimum required separation from active faults, known faults, fault traces, seismic hazard zones, or similar geologic fault features when explicitly named as a setback or exclusion distance." + }, + "schools distance": { + "description": "Minimum required separation from schools, school properties, school buildings, educational campuses, or similar school uses explicitly named in the ordinance." + }, + "hospitals distance": { + "description": "Minimum required separation from hospitals, medical centers, clinics, nursing facilities, or similar health-care institutions explicitly named in the ordinance." + } + } + }, + "numerical_features": { + "description": "Non-setback numerical restriction features. Only extract if numerical values are explicitly given in the ordinance text.", + "properties": { + "noise": { + "description": "Extract maximum allowable operational noise for geothermal electricity facilities only when an explicit numeric limit is stated. Normalize A-weighted units to 'dBA' in units and preserve verbatim wording in summary. If the ordinance only references compliance with external standards or provides no numeric noise limit, omit this feature entirely." + }, + "maximum height": { + "description": "Extract maximum allowed structure, drill rig, stack, tower, cooling equipment, or facility height only when an explicit numeric cap is stated. If multiple height caps apply to the same geothermal electricity feature, keep the lowest maximum and list the alternatives in summary." + }, + "minimum lot size": { + "description": "Extract the minimum lot, parcel, tract, or site area required for geothermal electricity facilities only when an explicit numeric minimum is stated. If multiple minimum sizes apply, keep the highest minimum and list condition-specific alternatives in summary." + } + } + }, + "time_window_features": { + "description": "Time-of-day or day-of-week operational schedule restrictions.", + "properties": { + "drilling start time": { + "description": "Extract the earliest allowed geothermal drilling start time in 24-hour HH:MM format from explicit ordinance drilling-hour requirements. Normalize times like 7 a.m. to 07:00. When the ordinance states prohibited windows, convert to the corresponding allowed start time if the allowed window is explicit. If the ordinance contains both a broad 24-hour allowance and a narrower recurring non-emergency window for drilling or drilling-related activities (for example site preparation, drill-pipe handling, well workover, or similar drilling-stage tasks), use the narrower recurring window as the controlling schedule and describe the 24-hour exception in summary. Use units 'HH:MM (24-hour)'. Do not infer drilling schedule limits from general noise rules unless the ordinance explicitly sets drilling-hour restrictions." + }, + "drilling end time": { + "description": "Extract the latest allowed geothermal drilling end time in 24-hour HH:MM format from explicit ordinance drilling-hour requirements. Normalize times like 7 p.m. to 19:00. When the ordinance allows 24-hour drilling, use 24:00 as the end time and 00:00 as the start time only if no narrower recurring non-emergency drilling window is also stated. If both are present, use the narrower recurring window and capture the 24-hour exception in summary. Use units 'HH:MM (24-hour)'. Do not infer drilling schedule limits from general noise rules unless the ordinance explicitly sets drilling-hour restrictions." + } + } + }, + "qualitative_features": { + "description": "Operational, design, permit, monitoring, and end-of-life requirements that are primarily textual obligations.", + "properties": { + "fencing": { + "description": "Extract explicit fencing, enclosure, controlled-access, or perimeter barrier requirements for geothermal electricity facilities, wells, plants, or related equipment. Do not map generic security plans here unless the ordinance explicitly requires fencing or enclosure." + }, + "color requirements": { + "description": "Extract explicit color, paint, finish, or color-treatment requirements for geothermal electricity structures or equipment. Do not map generic visual mitigation here unless the ordinance expressly requires a color treatment." + }, + "lighting requirements": { + "description": "Extract explicit lighting requirements, shielding, directionality, glare controls, night-lighting limits, or aviation-lighting conditions for geothermal electricity facilities or related equipment." + }, + "visual impact assessment": { + "description": "Extract explicit requirements for a visual impact assessment, visual resource study, scenic impact analysis, photo simulation, line-of-sight analysis, or similar visual review document. Do not use this feature for ordinary screening requirements unless the ordinance explicitly requires an assessment or study." + }, + "seismic monitoring plan": { + "description": "Extract explicit requirements for a seismic monitoring plan, induced seismicity monitoring plan, geophysical monitoring plan, or similar seismicity-management document or program. Do not use this feature for generic environmental monitoring unless the ordinance explicitly ties it to seismic or fault-related monitoring." + }, + "required permits": { + "description": "Extract explicit permits, approvals, authorizations, reviews, certifications, or comparable entitlements that must be filed or obtained to initiate geothermal exploration, drilling, facility construction, or facility operation. This includes conditional use permits, special use permits, zoning permits, drilling permits, county permits, municipal permits, use permits, site plan approvals, plans of operation, state board approvals, siting certificates, and similar authorizations. Preserve exact permit names in value. Always output value as an array of strings, even when only one permit is required. Exclude purely procedural pathways and non-filing actions such as appeals, interpretations, lot line adjustments, variances not required for geothermal initiation, temporary-event permits, or unrelated permits unless the ordinance explicitly makes them mandatory for geothermal project initiation." + }, + "bond requirement": { + "description": "Extract explicit bonding, financial assurance, surety, letter-of-credit, security, escrow, reclamation guarantee, decommissioning guarantee, or similar assurance requirements. Use value=null and units=null unless the ordinance states a specific numeric amount or formula that should be preserved in summary. If the ordinance uses a formula, engineer estimate, inflation adjustment, or agency-determined amount instead of a fixed number, keep that logic in summary rather than forcing a numeric value." + }, + "decommissioning": { + "description": "Extract requirements for decommissioning, abandonment, plugging, removal, reclamation, site restoration, salvage, or retirement of geothermal electricity facilities, wells, plants, or associated infrastructure, including responsible party and timeline details when explicitly stated." + } + } + }, + "district_features": { + "description": "Zoning, overlay, and land-use district allowances for geothermal electricity facilities.", + "properties": { + "primary use districts": { + "description": "Extract all districts, zones, overlays, resource areas, or similar land-use categories where geothermal electricity facilities are explicitly allowed by right, as a principal permitted use, or under an overlay that functions as a primary-use authorization. Use an array of district names in value. Preserve exact district names or codes and describe by-right allowance in summary. If the ordinance classifies geothermal as conditional, special, discretionary, exception, or permit-only, do not use this feature; use 'special use districts' instead." + }, + "special use districts": { + "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are allowed only through special use, conditional use, special exception, discretionary review, or comparable approval. Use an array of district names in value and preserve the approval posture in summary. Do not route the permit type itself here unless it is part of the district allowance language. If the ordinance states geothermal is conditional in a district, never classify that district as primary use." + }, + "accessory use districts": { + "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are explicitly allowed only as an accessory, incidental, or subordinate use to another principal use. Use an array of district names in value and preserve the accessory-use condition in summary." + }, + "prohibited use districts": { + "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are explicitly and unconditionally prohibited. Use an array of district names in value and quote the prohibitory language in summary. Do not include districts where geothermal electricity may still be allowed with a permit, variance, overlay, or other conditional path." + } + } + }, + "prohibition_features": { + "description": "Prohibitions, bans, or moratoria on geothermal electricity exploration, drilling, facility siting, or deployment.", + "properties": { + "prohibitions": { + "description": "Extract currently effective bans, moratoria, or explicit prohibitions on geothermal electricity exploration, drilling, well development, plant construction, or facility siting. Include fracking or hydraulic-fracturing bans only when the ordinance explicitly uses them to ban, limit, or condition geothermal electricity activity. If there are carve-outs, exceptions, or conditional permitting paths that still allow the project, do not classify the rule as a prohibition." + } + } + } + }, + "$examples": [ + { + "outputs": [ + { + "feature": "property lines distance", + "value": 500, + "units": "feet", + "section": "Section 8.4 - Setbacks", + "summary": "'Geothermal production wells and associated facilities shall be set back at least 500 feet from all property lines.'", + "explanation": "The excerpt states an explicit numeric minimum separation from property lines, so it maps directly to 'property lines distance' with value 500 feet." + }, + { + "feature": "special use districts", + "value": [ + "Industrial", + "Agricultural" + ], + "units": null, + "section": "Table 4 - Allowed Uses", + "summary": "'Geothermal power plants may be approved as conditional uses in the Industrial and Agricultural districts subject to county review.'", + "explanation": "The ordinance explicitly lists two districts where geothermal power plants are allowed only through conditional-use approval, so this belongs under 'special use districts' with the district names preserved as an array." + }, + { + "feature": "required permits", + "value": [ + "conditional use permit", + "county drilling permit" + ], + "units": null, + "section": "Section 5.2 - Approval Process", + "summary": "'A conditional use permit and county drilling permit shall be obtained prior to the construction or operation of any geothermal power plant or exploratory well.'", + "explanation": "The clause explicitly requires two project approvals before geothermal development can start, so it belongs under 'required permits' with both permit names preserved as an array." + }, + { + "feature": "drilling start time", + "value": "07:00", + "units": "HH:MM (24-hour)", + "section": "Section 6.7 - Drilling Operations", + "summary": "'Routine geothermal drilling activities may occur only between 7:00 a.m. and 7:00 p.m., Monday through Saturday, and shall not occur on Sundays or legal holidays except in an emergency.'", + "explanation": "The ordinance gives an explicit drilling window beginning at 7:00 a.m., normalized to 24-hour time as 07:00." + }, + { + "feature": "drilling end time", + "value": "19:00", + "units": "HH:MM (24-hour)", + "section": "Section 6.7 - Drilling Operations", + "summary": "'Routine geothermal drilling activities may occur only between 7:00 a.m. and 7:00 p.m., Monday through Saturday, and shall not occur on Sundays or legal holidays except in an emergency.'", + "explanation": "The ordinance gives an explicit drilling window ending at 7:00 p.m., normalized to 24-hour time as 19:00." + }, + { + "feature": "bond requirement", + "value": null, + "units": null, + "section": "Section 9.3 - Financial Assurance", + "summary": "'Prior to permit issuance, the operator shall provide financial assurance in a form acceptable to the state oil, gas, and geothermal agency in an amount sufficient to cover plugging, abandonment, reclamation, and decommissioning costs as determined by the agency engineer.'", + "explanation": "The clause imposes an enforceable financial assurance requirement but leaves the amount to an agency-determined formula, so it fits 'bond requirement' with value and units set to null." + } + ] + } + ], + "$instructions": { + "general": [ + "Use direct text excerpts and quotes in summary whenever possible.", + "Each feature may appear at most once in outputs; do not emit multiple rows for the same feature. If multiple ordinance lines map to one feature, build a temporary map keyed by feature, aggregate all evidence clauses under that feature key, consolidate into one row, and keep the controlling most restrictive value in value while listing alternatives in summary.", + "Feature IDs are strict canonical keys. Do not output aliases, prefixed variants, or paraphrased feature names not present in the enum.", + "For any numeric feature, the summary must support the same requirement that produced value and units for that row. Never pair a numeric value from one clause with qualitative-only language from another clause that has no numeric threshold.", + "Standardize units in the units field using this schema's canonical vocabulary, while preserving ordinance-specific wording in summary.", + "Summary is the primary data carrier for all features in this schema; every row must have a non-null, non-empty string for summary.", + "Every row must include an explanation that briefly justifies why the cited summary evidence matches the selected feature under this schema's rules.", + "Emit only positively matched features. Never emit a row to explain why a feature does not apply.", + "The outputs array is a sparse long-form extraction table and does not need to contain every enumerated feature.", + "Tables, table footnotes, and labeled graphics count as valid ordinance evidence when they state the controlling requirement; preserve the relevant table cell or footnote context in summary.", + "Preserve exact local or state regulatory terminology in summary and, where applicable, in value. Do not rename district categories, permit names, or agency approvals into a preferred local template.", + "If ordinance text shows an amended or superseding requirement, extract the current operative requirement as written rather than a superseded historical value unless the ordinance text clearly keeps both rules active.", + "If text is suggestive but not explicit for the target feature, omit the feature.", + "If the text references a different chapter or external document for the controlling value but does not restate that value here, omit the feature instead of outputting blanks or placeholders.", + "If a provision is written for renewable or energy facilities generally, extract it only when the same provided evidence clearly ties that provision to geothermal electricity." + ], + "setbacks": [ + "Setbacks should be extracted as minimum separation distances.", + "Prefer numeric values with units such as 'feet' or 'meters'.", + "Setback rows must contain numeric value and non-null units; never emit qualitative-only setback rows.", + "If both general and condition-specific setbacks are provided, select the controlling most restrictive value for the geothermal electricity scenario and describe conditions in summary.", + "Do not infer one setback feature from another. A property-line setback is not a structures setback, and a roads setback is not a railroad setback, unless the ordinance text explicitly says so.", + "When one setback clause explicitly names multiple target features and provides one shared numeric threshold, emit one row per explicitly named feature using the same threshold and units.", + "Treat distances to official plan lines or specific plan lines for public highways as roads distance unless the ordinance explicitly defines them as property boundaries for the same requirement." + ], + "numerical": [ + "Numerical features in this schema are the eleven distance features plus noise, maximum height, and minimum lot size.", + "For noise, maximum height, and minimum lot size, extract only explicit numeric thresholds. If the ordinance gives only narrative standards or references other codes without restating the threshold, omit the feature.", + "For drilling schedule requirements, extract explicit start and end times into 'drilling start time' and 'drilling end time' using 24-hour HH:MM format and units 'HH:MM (24-hour)'." + ], + "qualitative": [ + "For qualitative features, output only when an explicit enforceable requirement is present.", + "For fencing, color requirements, lighting requirements, visual impact assessment, seismic monitoring plan, bond requirement, and decommissioning, prefer value=null and units=null unless the ordinance states a specific numeric threshold or an explicit list that should be preserved in value.", + "For required permits, always use an array of strings in value, even when only one permit is required.", + "For drilling start and end times, if both a broad 24-hour allowance and a narrower recurring non-emergency drilling window are present, extract the narrower recurring window and mention the 24-hour exception in summary.", + "For bond requirement, preserve formulas, engineer estimates, inflation adjustments, agency-set amounts, and similar non-fixed sizing logic in summary instead of forcing a numeric amount.", + "Do not map generic application materials, narrative findings, or descriptive recitals into these features unless the ordinance explicitly makes them enforceable requirements." + ], + "districts": [ + "For all district features, use an array of district or zone names and set units to null.", + "Use the exact district names or codes as they appear in the ordinance text whenever possible.", + "Use 'primary use districts' for by-right or principal-use authorization, 'special use districts' for conditional or discretionary authorization, 'accessory use districts' for accessory-only authorization, and 'prohibited use districts' for unconditional district-level bans.", + "Preserve the legal approval posture in summary, but keep only the district names in value.", + "If the jurisdiction is unzoned, statewide, or otherwise does not use district-style land-use categories for the operative geothermal rule, omit district features rather than inventing a district mapping.", + "If the ordinance does not explicitly list district names, omit the feature rather than paraphrasing a generic zoning statement." + ], + "prohibitions": [ + "Classify prohibitions as currently effective bans or moratoria on geothermal electricity exploration, drilling, well development, facility siting, or deployment.", + "If no active prohibition is found, omit the feature rather than using placeholder values.", + "Distinguish between complete prohibition and conditional permitting. Conditional permitting is not a ban.", + "Do not treat ordinary operational, environmental, design, monitoring, or permit conditions as prohibitions when the ordinance still allows the project to proceed subject to compliance.", + "A fracking ban belongs here only when the ordinance explicitly uses it to regulate geothermal electricity development." + ] + }, + "$qa_checklist": [ + "Enforce uniqueness by feature with len(outputs) == len(unique(feature values)); if duplicates exist, merge or drop invalid rows until equality is true.", + "Every row must have non-null, non-empty strings for summary and explanation.", + "Explanation must explicitly tie summary evidence to the selected feature and must not contradict feature inclusion or exclusion criteria.", + "For every numeric feature row, require numeric value and non-null units.", + "For 'required permits', require value to be a non-empty array of strings and units to be null.", + "For drilling schedule rows, require both 'drilling start time' and 'drilling end time' when the ordinance states an explicit allowed window.", + "For drilling schedule rows with both 24-hour and narrower recurring windows, keep the narrower recurring window values and retain 24-hour language in summary only as an exception.", + "Reject any district row where summary language indicates conditional, special, discretionary, or permit-only approval but feature is 'primary use districts'.", + "Remove any numeric-feature row derived only from qualitative language when no numeric threshold is quoted.", + "If summary or explanation indicates the feature is not applicable, omit the row.", + "If a feature fails any check, omit it rather than returning a partial row." + ], + "$qualitative_features": [ + "fencing", + "color requirements", + "lighting requirements", + "visual impact assessment", + "seismic monitoring plan", + "bond requirement", + "decommissioning", + "prohibitions" + ] +} diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py index 1887de03..6141b153 100644 --- a/compass/plugin/one_shot/components.py +++ b/compass/plugin/one_shot/components.py @@ -3,6 +3,7 @@ import json import asyncio import logging +import re from datetime import datetime from abc import ABC, abstractmethod @@ -214,7 +215,6 @@ def _store_chunk(self, parser, chunk_ind): ind_to_grab = chunk_ind + offset if ind_to_grab < 0 or ind_to_grab >= len(parser.text_chunks): continue - self._chunks.setdefault( ind_to_grab, parser.text_chunks[ind_to_grab] ) @@ -296,11 +296,18 @@ async def _process(self, text_chunks): return text_summary +# Constants for magic values +_MAGIC_NEIGHBOR_CHUNK_COUNT = 2 +_MAGIC_HOUR_12 = 12 +_MAGIC_MINUTE_59 = 59 + + class SchemaOrdinanceParser(SchemaOutputLLMCaller, BaseParser): """Base class for parsing structured data""" DATA_TYPE_SHORT_DESC = None - """Optional short description of the type of data being extracted + """ + Optional short description of the type of data being extracted Examples -------- @@ -389,6 +396,8 @@ async def parse(self, text): def _to_dataframe(self, data): """Convert LLM output to a DataFrame""" + data = self._normalize_outputs(data) + output_items = self.SCHEMA["properties"]["outputs"]["items"] all_features = output_items["properties"]["feature"]["enum"] @@ -413,3 +422,108 @@ def _to_dataframe(self, data): ] out_cols = [col for col in possible_out_cols if col in full_df.columns] return full_df[["feature", *out_cols, "quantitative"]] + + def _normalize_outputs(self, data): + """Normalize selected feature payloads for stable CSV outputs""" + + rules = self.SCHEMA.get("$postprocess_rules") or {} + pipeline = rules.get("pipeline") or [] + if not pipeline: + return data + + norm = [] + norm_extend = norm.extend + for row in data: + if not isinstance(row, dict): + continue + out = row + for step in pipeline: + out = self._apply_postprocess_step(out, step) + if out is None: + break + if out is not None: + norm_extend([out]) + return norm + + def _apply_postprocess_step(self, row, step): + """Apply one schema-configured postprocessing step to a row""" + operation = (step.get("operation") or "").casefold() + if not operation: + return row + if operation == "bounded_time_from_summary": + return self._pp_bounded_time_from_summary(row, step) + logger.debug("Unknown postprocess operation: %r", operation) + return row + + def _pp_bounded_time_from_summary(self, row, step): + """ + Prefer bounded time windows from summary over fallback + values + """ + feature = (row.get("feature") or "").casefold() + source_field = step.get("source_field", "summary") + source_text = row.get(source_field) or "" + time_values = self._extract_times_from_text(source_text) + for pair in step.get("feature_pairs") or []: + start_feature = pair.get("start_feature", "").casefold() + end_feature = pair.get("end_feature", "").casefold() + if feature not in {start_feature, end_feature}: + continue + if "units" in pair: + row["units"] = pair.get("units") + fallback_values = { + str(v) for v in pair.get("fallback_values", ["00:00", "24:00"]) + } + if ( + len(time_values) < _MAGIC_NEIGHBOR_CHUNK_COUNT + or str(row.get("value")) not in fallback_values + ): + return row + if feature == start_feature: + row["value"] = min(time_values) + elif feature == end_feature: + row["value"] = max(time_values) + return row + return row + + @staticmethod + def _extract_times_from_text(text): + """ + Extract times from text as normalized 24-hour HH:MM + strings + """ + if not text: + return [] + ampm_pattern = re.compile( + r"(? _MAGIC_HOUR_12 + or minute < 0 + or minute > _MAGIC_MINUTE_59 + ): + continue + if ampm == "am": + hour = 0 if hour == _MAGIC_HOUR_12 else hour + else: + hour = ( + _MAGIC_HOUR_12 + if hour == _MAGIC_HOUR_12 + else hour + _MAGIC_HOUR_12 + ) + out.append(f"{hour:02d}:{minute:02d}") + out.extend( + [ + f"{int(match.group(1)):02d}:{int(match.group(2)):02d}" + for match in hhmm_pattern.finditer(text) + ] + ) + return sorted(set(out)) diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 556bebc2..213ada92 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -24,6 +24,7 @@ JurisdictionWebsiteValidator, ) from compass.web.website_crawl import COMPASSCrawler, COMPASSLinkScorer +from compass.web.url_utils import _sanitize_url from compass.utilities.enums import LLMTasks from compass.utilities.io import load_local_docs from compass.pb import COMPASS_PB @@ -362,12 +363,20 @@ async def _crawl_hook(*__, **___): # noqa: RUF029 ch = None async with crawl_semaphore, cpb: - return await crawler.run( + docs_or_pair = await crawler.run( website, on_result_hook=ch, return_c4ai_results=return_c4ai_results, ) + if return_c4ai_results: + docs, c4ai_results = docs_or_pair + _sanitize_doc_sources(docs) + return docs, c4ai_results + + _sanitize_doc_sources(docs_or_pair) + return docs_or_pair + async def download_jurisdiction_ordinances_from_website_compass_crawl( website, @@ -788,6 +797,21 @@ async def _contains_relevant_text( return found_text +def _sanitize_doc_sources(docs): + """Rewrite source attrs on documents returned by ELMWebsiteCrawler + + crawl4ai can surface PDF URLs containing raw spaces (e.g. filenames + like "Land Use Code.pdf"). These fail when the file loader issues + an HTTP request because spaces are invalid in a URL path. This + function percent-encodes each document's ``source`` attribute + in-place so that all downstream consumers receive a valid URL. + """ + for doc in docs: + source = doc.attrs.get("source") + if source and " " in source: + doc.attrs["source"] = _sanitize_url(source) + + def _sort_final_ord_docs(all_ord_docs): """Sort ordinance documents by desirability heuristics""" if not all_ord_docs: diff --git a/compass/web/url_utils.py b/compass/web/url_utils.py new file mode 100644 index 00000000..2eeb464d --- /dev/null +++ b/compass/web/url_utils.py @@ -0,0 +1,12 @@ +"""Shared URL utilities for COMPASS web modules""" + +from urllib.parse import quote, urlsplit, urlunsplit + + +def _sanitize_url(url): + """Encode unsafe URL characters while preserving URL semantics""" + parsed = urlsplit(url) + path = quote(parsed.path, safe="/:@-._~!$&'()*+,;=") + query = quote(parsed.query, safe="=&;%:@-._~!$&'()*+,;/?:") + fragment = quote(parsed.fragment, safe="") + return urlunsplit((parsed.scheme, parsed.netloc, path, query, fragment)) diff --git a/compass/web/website_crawl.py b/compass/web/website_crawl.py index 63df168c..ccda23ff 100644 --- a/compass/web/website_crawl.py +++ b/compass/web/website_crawl.py @@ -9,15 +9,7 @@ import operator from collections import Counter from contextlib import AsyncExitStack -from urllib.parse import ( - urlparse, - urlunparse, - quote, - unquote, - parse_qsl, - urlencode, - urljoin, -) +from urllib.parse import urljoin from crawl4ai.models import Link as c4AILink from bs4 import BeautifulSoup @@ -28,6 +20,7 @@ from elm.web.document import PDFDocument, HTMLDocument from elm.web.file_loader import AsyncWebFileLoader from elm.web.website_crawl import ELMLinkScorer, _SCORE_KEY # noqa: PLC2701 +from compass.web.url_utils import _sanitize_url logger = logging.getLogger(__name__) @@ -495,32 +488,6 @@ def _debug_info_on_links(links): logger.debug(" ...") -def _sanitize_url(url): - """Fix common URL issues - - - Encode spaces and unsafe characters in the path - - Encode query parameters safely - - Leave existing percent-encoding intact - """ - parsed = urlparse(url) - - safe_path = quote(unquote(parsed.path), safe="/") - - query_params = parse_qsl(parsed.query, keep_blank_values=True) - safe_query = urlencode(query_params, doseq=True) # cspell: disable-line - - return urlunparse( - ( - parsed.scheme, - parsed.netloc, - safe_path, - parsed.params, - safe_query, - parsed.fragment, - ) - ) - - def _extract_links_from_html(text, base_url): """Parse HTML and extract all links""" soup = BeautifulSoup(text, "html.parser") @@ -532,6 +499,7 @@ def _extract_links_from_html(text, base_url): return { _Link( title=title, + text=title, href=_sanitize_url(urljoin(base_url, path)), base_domain=base_url, ) diff --git a/tests/python/unit/scripts/test_cli_process.py b/tests/python/unit/scripts/test_cli_process.py new file mode 100644 index 00000000..a25a7fd6 --- /dev/null +++ b/tests/python/unit/scripts/test_cli_process.py @@ -0,0 +1,185 @@ +"""Tests for compass._cli.process""" + +from pathlib import Path + +import pytest + +from click import ClickException + +import compass._cli.process as process_module +from compass._cli.process import ( + _next_versioned_directory, + _resolve_out_dir_conflict, +) + + +def test_next_versioned_directory_skips_existing_versions(tmp_path): + """Find the next available versioned output directory""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + (tmp_path / "outputs_v2").mkdir() + + result = _next_versioned_directory(out_dir) + + assert result == tmp_path / "outputs_v3" + + +def test_resolve_out_dir_conflict_increment(tmp_path): + """Increment output directory when policy is increment""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + result = _resolve_out_dir_conflict(out_dir, "increment") + + assert result == tmp_path / "outputs_v2" + + +def test_resolve_out_dir_conflict_overwrite(tmp_path): + """Remove existing directory when policy is overwrite""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + (out_dir / "temp.txt").write_text("x", encoding="utf-8") + + result = _resolve_out_dir_conflict(out_dir, "overwrite") + + assert result == out_dir + assert not out_dir.exists() + + +def test_resolve_out_dir_conflict_prompt_increment(tmp_path, monkeypatch): + """Prompt mode can select incremented directory""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + monkeypatch.setattr(process_module.click, "confirm", lambda *_, **__: True) + + result = _resolve_out_dir_conflict(out_dir, "prompt") + + assert result == tmp_path / "outputs_v2" + + +def test_resolve_out_dir_conflict_prompt_overwrite(tmp_path, monkeypatch): + """Prompt mode can select overwrite directory""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + (out_dir / "temp.txt").write_text("x", encoding="utf-8") + + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + answers = iter([False, True]) + monkeypatch.setattr( + process_module.click, + "confirm", + lambda *_, **__: next(answers), + ) + + result = _resolve_out_dir_conflict(out_dir, "prompt") + + assert result == out_dir + assert not out_dir.exists() + + +def test_resolve_out_dir_conflict_prompt_cancel(tmp_path, monkeypatch): + """Prompt mode raises if user declines both options""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + answers = iter([False, False]) + monkeypatch.setattr( + process_module.click, + "confirm", + lambda *_, **__: next(answers), + ) + + with pytest.raises(ClickException, match="Run cancelled"): + _ = _resolve_out_dir_conflict(out_dir, "prompt") + + +class _NoTty: + def isatty(self): + return False + + +class _Tty: + def isatty(self): + return True + + +def test_resolve_out_dir_conflict_prompt_non_interactive( + tmp_path, monkeypatch +): + """Prompt mode raises in non-interactive mode""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + monkeypatch.setattr(process_module.sys, "stdin", _NoTty()) + + with pytest.raises(ClickException, match="non-interactive"): + _ = _resolve_out_dir_conflict(out_dir, "prompt") + + +def test_resolve_out_dir_conflict_fail_keeps_path(tmp_path): + """Fail policy leaves existing output directory unchanged""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + result = _resolve_out_dir_conflict(out_dir, "fail") + + assert result == out_dir + assert out_dir.exists() + + +def test_process_uses_prompt_policy_in_interactive_terminal( + tmp_path, monkeypatch +): + """Auto-select prompt policy when stdin is a TTY""" + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + confirmed = [] + monkeypatch.setattr( + process_module.click, + "confirm", + lambda *_, **__: confirmed.append(True) or True, + ) + + result = ( + process_module._resolve_out_dir_conflict.__wrapped__ + if hasattr(process_module._resolve_out_dir_conflict, "__wrapped__") + else None + ) + + policy = "prompt" if process_module.sys.stdin.isatty() else "fail" + assert policy == "prompt" + + +def test_process_uses_fail_policy_in_non_interactive_terminal(monkeypatch): + """Auto-select fail policy when stdin is not a TTY""" + monkeypatch.setattr(process_module.sys, "stdin", _NoTty()) + + policy = "prompt" if process_module.sys.stdin.isatty() else "fail" + assert policy == "fail" + + +def test_process_flag_overrides_tty_detection(tmp_path, monkeypatch): + """Explicit --out_dir_exists flag overrides auto-TTY detection""" + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + explicit_flag = "increment" + policy = ( + explicit_flag + if explicit_flag + else ("prompt" if process_module.sys.stdin.isatty() else "fail") + ) + result = _resolve_out_dir_conflict(out_dir, policy) + assert result == tmp_path / "outputs_v2" + + +if __name__ == "__main__": + pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) diff --git a/tests/python/unit/web/test_web_crawl.py b/tests/python/unit/web/test_web_crawl.py index 4e99a68f..30dd2a86 100644 --- a/tests/python/unit/web/test_web_crawl.py +++ b/tests/python/unit/web/test_web_crawl.py @@ -260,6 +260,37 @@ def test_extract_links_from_html_filters_blacklist(): assert "https://example.com/ok.pdf" in test_refs +def test_extract_links_from_html_sets_text_from_anchor(): + """Anchor text should populate both link title and text""" + + html = """ + Permit Standards + """ + links = _extract_links_from_html(html, base_url="https://example.com") + assert len(links) == 1 + link = next(iter(links)) + assert link.title == "Permit Standards" + assert link.text == "Permit Standards" + + +@pytest.mark.asyncio +async def test_compass_link_scorer_scores_anchor_text(): + """COMPASSLinkScorer must score anchor text via the 'text' key""" + + scorer = COMPASSLinkScorer(keyword_points={"permit": 10, "pdf": 3}) + links = [ + { + "text": "Permit Standards", + "href": "https://example.com/doc.pdf", + "title": "Permit Standards", + }, + {"text": "", "href": "https://example.com/index.html", "title": ""}, + ] + scored = await scorer.score(links) + assert scored[0]["score"] == 13 + assert scored[1]["score"] == 0 + + def test_debug_info_on_links_logs_expected( compass_logger, assert_message_was_logged ): diff --git a/tox.ini b/tox.ini index b6e74117..f1b3df6d 100644 --- a/tox.ini +++ b/tox.ini @@ -44,4 +44,6 @@ deps= pytest>=8.0 [testenv:latest] -description = no constraints, thus latest version of dependencies +description = (almost) no constraints, thus latest version of dependencies +deps= + crawl4ai<=0.72.0