From 93415ce7e9a3e599bb591d81d6b77f1b5b04edcc Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Tue, 17 Mar 2026 12:11:08 -0600 Subject: [PATCH 01/21] Add geothermal extraction integration artifacts --- .../geothermal_config.json5 | 28 ++ .../geothermal_plugin_config.yaml | 206 ++++++++++ .../geothermal_schema.json | 379 ++++++++++++++++++ 3 files changed, 613 insertions(+) create mode 100644 compass/extraction/geothermal_electricity/geothermal_config.json5 create mode 100644 compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml create mode 100644 compass/extraction/geothermal_electricity/geothermal_schema.json diff --git a/compass/extraction/geothermal_electricity/geothermal_config.json5 b/compass/extraction/geothermal_electricity/geothermal_config.json5 new file mode 100644 index 00000000..5bd8037b --- /dev/null +++ b/compass/extraction/geothermal_electricity/geothermal_config.json5 @@ -0,0 +1,28 @@ +// geothermal_config.json5 +{ + "out_dir": "./outputs_geothermal_dev_one", + "tech": "geothermal_electricity", + "jurisdiction_fp": "./geothermal_jurisdictions_one.csv", + "search_engines": [ + { + "se_name": "SerpAPIGoogleSearch", + "verify": false + } + ], + "file_loader_kwargs": { + "verify_ssl": false + }, + "model": [ + { + "name": "compassop-gpt-4.1-mini", + "llm_call_kwargs": { + "temperature": 0, + "timeout": 600 + }, + "client_kwargs": { + "api_version": "2025-04-01-preview", + "azure_endpoint": "https://aoai-prod-eastus-compassop-001.openai.azure.com/" + } + } + ] +} diff --git a/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml b/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml new file mode 100644 index 00000000..a983158a --- /dev/null +++ b/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml @@ -0,0 +1,206 @@ +schema: ./geothermal_schema.json + +data_type_short_desc: utility-scale geothermal electricity ordinance + 
+cache_llm_generated_content: true + +query_templates: + - "filetype:pdf {jurisdiction} geothermal power plant ordinance" + - "geothermal electricity generation ordinance {jurisdiction}" + - "{jurisdiction} geothermal energy facility zoning ordinance" + - "{jurisdiction} geothermal power plant land use code" + - "{jurisdiction} geothermal code of ordinances" + - "{jurisdiction} geothermal conditional use permit" + - "{jurisdiction} geothermal special use permit" + - "{jurisdiction} geothermal drilling permit regulations" + - "{jurisdiction} geothermal resource development statute" + - "Where can I find the legal text for geothermal power plant zoning ordinances in {jurisdiction}?" + - "What is the specific legal information regarding zoning ordinances for geothermal electricity generation facilities in {jurisdiction}?" + +website_keywords: + pdf: 92160 + geothermal: 46080 + ordinance: 23040 + zoning: 11520 + regulation: 5760 + code: 2880 + power: 1440 + electricity: 1440 + planning: 720 + permit: 720 + land use: 720 + municipal: 720 + county: 360 + ordinance code: 360 + code of ordinances: 360 + land use code: 360 + use table: 360 + chapter: 180 + article: 180 + title: 180 + statute: 180 + administrative code: 180 + conditional use permit: 180 + special use permit: 180 + drilling permit: 180 + resource development: 180 + government: 180 + +heuristic_keywords: + good_tech_keywords: + - "wellfield" + - "well field" + - "production well" + - "geothermal exploration" + - "geothermal generating" + - "geothermal generation" + - "geothermal power" + - "geothermal production" + - "geothermal project" + - "geothermal overlay zone" + - "geothermal power plant" + - "geothermal facility" + - "geothermal electric" + - "geothermal energy facility" + - "steam turbine" + - "binary cycle" + - "flash steam" + - "dry steam" + - "enhanced geothermal" + - "reservoir temperature" + - "brine" + - "reinjection well" + - "production zone" + - "geothermal resource" + - "geothermal 
production project" + - "geothermal drilling" + - "exploratory well" + - "injection well" + - "geothermal lease" + - "drilling permit" + - "plan of utilization" + - "known geothermal resource" + - "geothermal development" + - "geothermal well" + - "geothermal reservoir" + - "geothermal permit" + - "geothermal ordinance" + - "geothermal zoning" + - "code of ordinances" + - "land use code" + - "use table" + - "zoning ordinance" + - "special use permit" + - "conditional use permit" + good_tech_acronyms: + - "egs" + - "kgra" + good_tech_phrases: + - "geothermal power plant" + - "geothermal electricity generation" + - "geothermal energy facility" + - "geothermal resource development" + - "known geothermal resource area" + - "binary cycle" + - "flash steam" + - "dry steam" + - "steam turbine" + - "plan of utilization" + - "land use code" + - "code of ordinances" + - "zoning ordinance" + - "special use permit" + - "conditional use permit" + not_tech_words: + - "geothermal heat pump" + - "ground source heat pump" + - "ground-source heat pump" + - "ghp" + - "ground heat pump" + - "gshp" + - "ground-coupled heat pump" + - "ground coupled heat pump" + - "earth-coupled heat pump" + - "earth-source heat pump" + - "geoexchange" + - "geo-exchange" + - "closed loop" + - "closed-loop" + - "open loop" + - "vertical loop" + - "horizontal loop" + - "heating and cooling" + - "hvac" + - "space heating" + - "water heating" + - "direct use" + - "direct-use" + - "district heating" + - "greenhouse heating" + - "residential geothermal" + - "accessory use" + - "energy star" + - "solar panel" + - "solar array" + - "solar farm" + - "solar energy system" + - "solar energy facility" + - "photovoltaic" + - "net metering" + - "solar collector" + - "solar ordinance" + - "wind energy" + - "wind farm" + - "wind turbine" + - "wind energy system" + - "wind energy facility" + - "wind energy conversion" + - "wind ordinance" + - "anemometer tower" + - "meteorological tower" + - "rotor diameter" + - "tip 
height" + - "nacelle" + - "battery storage" + - "energy storage system" + - "hydroelectric" + - "biomass" + +collection_prompts: true + +text_extraction_prompts: true + +extraction_system_prompt: |- + You are a legal scholar extracting structured data from geothermal + electricity ordinances and regulations. + + Be focused and literal: extract only enacted, explicit, in-scope + requirements. + Be thorough and complete: review all relevant sections, including tables, + use tables, footnotes, and lists, so no explicitly stated feature is + missed. + Before finalizing, perform a feature-coverage check against the schema enum + and ensure each explicitly supported feature is captured at most once. + + Follow all schema instructions exactly. + Extract only enacted requirements that apply to utility-scale geothermal + electricity generation, geothermal exploration or drilling, geothermal + wells, geothermal power plants, or directly associated generation + facilities such as substations or gen-tie lines when the ordinance + explicitly governs them. + + Do not extract rules that apply only to geothermal heat pumps, HVAC, + direct-use geothermal, district heating, residential systems, or other + non-generation technologies. + + State-level statutes or regulations are valid when they govern local or + project-level siting, permitting, drilling, setbacks, environmental + controls, decommissioning, or operating requirements. + + Prefer explicit enacted values over interpretation. + Use null values and null units for qualitative obligations when the + summary carries the operative legal requirement. + Treat fracking or hydraulic-fracturing language as in scope only when the + ordinance explicitly uses it to regulate or prohibit geothermal + electricity development. + Keep summaries source-faithful and include important conditions. 
diff --git a/compass/extraction/geothermal_electricity/geothermal_schema.json b/compass/extraction/geothermal_electricity/geothermal_schema.json new file mode 100644 index 00000000..6de8d44f --- /dev/null +++ b/compass/extraction/geothermal_electricity/geothermal_schema.json @@ -0,0 +1,379 @@ +{ + "title": "Geothermal Electricity Ordinance Extraction Schema", + "description": "Single-shot structured extraction schema for utility-scale geothermal electricity ordinances. This schema guides an LLM to extract all relevant features in one call and returns an outputs array where each object represents one row in the extracted long-form table.", + "version": "2.1.0", + "type": "object", + "required": ["outputs"], + "additionalProperties": false, + "properties": { + "outputs": { + "type": "array", + "description": "Sparse long-form extraction table. Include only features with an enacted, explicit requirement and emit at most one row per feature. Never infer, imply, or guess a requirement from related context.", + "items": { + "type": "object", + "required": [ + "feature", + "value", + "units", + "section", + "summary", + "explanation" + ], + "additionalProperties": false, + "properties": { + "feature": { + "type": "string", + "description": "The ordinance feature being extracted. 
Must be one of the enumerated feature IDs.", + "enum": [ + "structures residential zones setback", + "property lines setback", + "roads setback", + "railroads setback", + "existing transmission lines setback", + "water bodies setback", + "combustible tanks setback", + "domestic wells setback", + "active faults setback", + "schools setback", + "hospitals setback", + "drilling hours", + "noise", + "maximum height", + "minimum lot size", + "fencing", + "color requirements", + "lighting requirements", + "visual impact assessment", + "seismic monitoring plan", + "primary use districts", + "special use districts", + "accessory use districts", + "prohibited use districts", + "permit requirement", + "bond requirement", + "decommissioning", + "prohibitions", + "ordinance date" + ] + }, + "value": { + "description": "The extracted ordinance value. For numerical setbacks and limits, use a number. For permit or district lists, use an array of strings. For time windows, date language, or other categorical outcomes, use a string. Use null only for qualitative features, and only when an enacted, explicit, enforceable ordinance requirement for that feature is present. Null must never be used to indicate absence. If a feature has no enacted, explicit requirement in the ordinance text, omit that feature from outputs.", + "anyOf": [ + { + "type": "number" + }, + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + }, + { + "type": "null" + } + ] + }, + "units": { + "type": ["string", "null"], + "description": "Units for the extracted value. Preserve ordinance wording when possible. For setbacks and height, use linear units such as 'feet' or 'meters' as stated. For minimum lot size, use area units such as 'acres' or 'square feet'. For noise, use 'dBA' only if the ordinance says 'dBA' or 'dB(A)'; if it says 'dB' without A-weighting, keep 'dB'. Use null for district lists, permit lists, dates, and qualitative requirements without measurable units. 
For drilling-hours strings, use null unless the ordinance states a measurable hour cap rather than an operating window." + }, + "section": { + "type": ["string", "null"], + "description": "The section title or number where the requirement appears. Include labels or numbers if available. Null if no section identifier is available." + }, + "summary": { + "type": "string", + "description": "A short summary with direct ordinance excerpts or quotes whenever possible. For qualitative features such as permitting, fencing, lighting, seismic monitoring, decommissioning, and prohibitions, this is the primary output field and should contain direct ordinance language. For numeric features, summary must support the same requirement used to extract value and units. Must be a non-null, non-empty string. Do not output absence placeholders such as 'No explicit requirement found'; omit the feature instead when no requirement is present." + }, + "explanation": { + "type": "string", + "description": "Brief rationale explaining why this row matches the selected feature under this schema. Reference the specific evidence in summary and how it supports the extracted value and units or, for qualitative features, the inclusion criteria. Must be a non-null, non-empty string and must not use absence placeholders." + } + } + } + } + }, + "$core_principles": { + "scope_context": { + "description": "Only extract requirements that apply to utility-scale geothermal electricity generation and directly associated geothermal electricity infrastructure, including exploration or drilling operations, production or injection wells, geothermal power plants, associated substations, and gen-tie lines only when the ordinance text explicitly governs them. Exclude geothermal heat pumps, HVAC, direct-use geothermal, district heating, greenhouse heating, residential geothermal, and other non-generation systems unless the ordinance explicitly governs utility-scale electricity generation as well. 
State statutes and regulations are in scope when they impose enforceable siting, zoning, drilling, permitting, monitoring, bonding, setback, or decommissioning requirements on geothermal electricity projects in the jurisdiction. This schema must work across nationwide ordinance styles, including county, municipal, township, parish, borough, tribal, and state-level regulatory text." + }, + "nationwide_jurisdiction_handling": { + "description": "Preserve the ordinance's own governance vocabulary instead of normalizing it to one jurisdiction type. Districts may appear as zoning districts, use districts, overlays, resource areas, exclusive farm use zones, planned development areas, or similar land-use categories. Permits may appear as conditional use permits, special use permits, administrative permits, use permits, site certificates, plans of operation, drilling authorizations, county approvals, state board approvals, or similar authorizations. If a jurisdiction is effectively unzoned or the controlling requirements come from state regulation without district tables, omit district features rather than forcing a district classification." + }, + "strict_evidence_gate": { + "description": "Extract a feature only when the ordinance text explicitly states a requirement, permission, district allowance, prohibition, date, or other operative rule for that same feature. Never infer, assume, extrapolate, or guess from related context, implications, headings, or nearby provisions. Tables, matrices, footnotes, appendices, and labeled exhibits count when they state the operative requirement. If the ordinance points to an outside document or technical standard without restating the controlling requirement in the ordinance text itself, do not import missing values from that outside source." + }, + "data_omission": { + "description": "Emit only positively matched features. If a feature is not explicitly present, omit it entirely rather than returning placeholder text. 
For qualitative features, use value=null and units=null only when an enacted, explicit requirement for that same feature is present. For numeric features, extract only when an explicit numeric threshold is stated in the ordinance text; otherwise omit the feature instead of returning null, empty, or qualitative-only values. Never emit absence placeholders such as 'not found', 'no explicit requirement', 'none', or similar text in any field." + }, + "numeric_prioritization": { + "description": "When multiple numeric values apply to the same feature, keep one row and select the controlling most restrictive value for that feature. Restrictiveness rules: setbacks choose the largest minimum separation distance; noise choose the lowest allowed numeric limit; maximum height choose the lowest maximum height; minimum lot size choose the highest minimum lot size. Keep condition-specific alternatives in summary only when the ordinance text explicitly shows they all apply to the same geothermal electricity feature." + }, + "district_and_permit_routing": { + "description": "Use the district features only for zoning districts, overlay districts, or land-use districts that explicitly classify geothermal electricity facilities. Use 'primary use districts' when the use is allowed by right or as a principal permitted use. Use 'special use districts' when the use requires a conditional use permit, special use permit, or comparable discretionary approval. Use 'accessory use districts' only when the ordinance explicitly allows geothermal electricity as an accessory or secondary use. Use 'prohibited use districts' only for districts where geothermal electricity facilities are explicitly and unconditionally prohibited. Use 'permit requirement' for explicit permit, approval, site-plan, zoning-permit, county-permit, drilling-permit, or similar authorization requirements. Do not route permit requirements into district features, and do not route district tables into permit features." 
+ }, + "prohibition_boundary": { + "description": "Use 'prohibitions' only for currently effective bans or moratoria on geothermal electricity exploration, drilling, well development, power plant siting, facility construction, or related project deployment. A ban on hydraulic fracturing or fracking counts only when the ordinance explicitly uses that ban to prohibit, limit, or condition geothermal electricity development. Do not treat ordinary permit conditions, environmental standards, or operational restrictions as prohibitions when the project remains allowable subject to compliance." + } + }, + "$definitions": { + "setback_features": { + "description": "Setback features for geothermal electricity facilities and related infrastructure. Treat each setback feature independently and do not cross-apply a setback unless the ordinance text explicitly states that it applies to multiple target types. When a single clause explicitly lists multiple target types and one shared numeric setback, emit one row per explicitly listed feature using the same numeric value and units and cite the same clause in summary. Apply the shared numeric prioritization rules in $core_principles when multiple numeric values explicitly apply to the same feature.", + "properties": { + "structures residential zones setback": { + "description": "Minimum required separation from structures, dwellings, occupied buildings, residences, homes, residential receptors, residential uses, or residential zoning districts. Extract this feature only when the ordinance explicitly ties the setback to structures, residences, occupied buildings, or residential zones. If one clause applies a common setback to multiple structure-like receptors such as homes, occupied buildings, and residential districts, keep one row under this feature and preserve the exact receptor list in summary. Do not map generic property line or district boundary setbacks here unless the text explicitly names structures or residential zones." 
+ }, + "property lines setback": { + "description": "Minimum required separation from property lines, lot lines, parcel boundaries, or lease boundaries when the ordinance explicitly states the setback is measured from that boundary. Do not remap property-line setbacks to roads, transmission lines, or residential zones unless the text explicitly makes them equivalent for that requirement." + }, + "roads setback": { + "description": "Minimum required separation from public roads, road rights-of-way, streets, highways, named roadway corridors, or similar transportation corridors explicitly framed as roads. Property-line setbacks do not count for this feature unless the ordinance text explicitly states that the property line is the road right-of-way or otherwise makes them the same boundary for that requirement." + }, + "railroads setback": { + "description": "Minimum required separation from railroads, railroad rights-of-way, rail corridors, or active rail lines. Extract only when rail infrastructure is explicitly named." + }, + "existing transmission lines setback": { + "description": "Minimum required separation from existing transmission lines, transmission corridors, substations, or other existing electric transmission infrastructure when explicitly named. Use this feature for geothermal generation or gen-tie line setbacks from existing transmission assets only when the ordinance text expressly states the setback." + }, + "water bodies setback": { + "description": "Minimum required separation from rivers, streams, lakes, ponds, wetlands, reservoirs, shorelines, floodplains, springs, or other water bodies explicitly named in the ordinance. Do not map domestic water well setbacks here." + }, + "combustible tanks setback": { + "description": "Minimum required separation from combustible tanks, fuel tanks, flammable storage tanks, petroleum tanks, or similar combustible storage infrastructure. Extract only when that storage infrastructure is explicitly named." 
+ }, + "domestic wells setback": { + "description": "Minimum required separation from domestic wells, private wells, drinking water wells, household wells, or other non-production water supply wells explicitly identified as domestic or private. Do not map production, injection, monitoring, or geothermal wells here." + }, + "active faults setback": { + "description": "Minimum required separation from active faults, known faults, fault traces, seismic hazard zones, or similar geologic fault features when explicitly named as a setback or exclusion distance." + }, + "schools setback": { + "description": "Minimum required separation from schools, school properties, school buildings, educational campuses, or similar school uses explicitly named in the ordinance." + }, + "hospitals setback": { + "description": "Minimum required separation from hospitals, medical centers, clinics, nursing facilities, or similar health-care institutions explicitly named in the ordinance." + } + } + }, + "numerical_features": { + "description": "Non-setback numerical restriction features. Only extract if numerical values are explicitly given in the ordinance text.", + "properties": { + "noise": { + "description": "Extract maximum allowable operational noise for geothermal electricity facilities only when an explicit numeric limit is stated. Keep units exactly as written in the text. If the ordinance only references compliance with external standards or provides no numeric noise limit, omit this feature entirely." + }, + "maximum height": { + "description": "Extract maximum allowed structure, drill rig, stack, tower, cooling equipment, or facility height only when an explicit numeric cap is stated. If multiple height caps apply to the same geothermal electricity feature, keep the lowest maximum and list the alternatives in summary." 
+ }, + "minimum lot size": { + "description": "Extract the minimum lot, parcel, tract, or site area required for geothermal electricity facilities only when an explicit numeric minimum is stated. If multiple minimum sizes apply, keep the highest minimum and list condition-specific alternatives in summary." + } + } + }, + "time_window_features": { + "description": "Time-of-day or day-of-week operational schedule restrictions.", + "properties": { + "drilling hours": { + "description": "Extract explicit drilling-hours requirements, including allowed or prohibited hours of operation, days of week, holiday restrictions, and emergency exceptions. Use a string value when the ordinance states a time window or schedule, and use units=null. If the ordinance instead states a numeric cap on hours per day or week, you may use a numeric value with units such as 'hours per day' or 'hours per week'. Do not infer drilling-hours limits from general noise rules unless the ordinance explicitly sets drilling-hour restrictions." + } + } + }, + "qualitative_features": { + "description": "Operational, design, permit, monitoring, and end-of-life requirements that are primarily textual obligations.", + "properties": { + "fencing": { + "description": "Extract explicit fencing, enclosure, controlled-access, or perimeter barrier requirements for geothermal electricity facilities, wells, plants, or related equipment. Do not map generic security plans here unless the ordinance explicitly requires fencing or enclosure." + }, + "color requirements": { + "description": "Extract explicit color, paint, finish, or color-treatment requirements for geothermal electricity structures or equipment. Do not map generic visual mitigation here unless the ordinance expressly requires a color treatment." 
+ }, + "lighting requirements": { + "description": "Extract explicit lighting requirements, shielding, directionality, glare controls, night-lighting limits, or aviation-lighting conditions for geothermal electricity facilities or related equipment." + }, + "visual impact assessment": { + "description": "Extract explicit requirements for a visual impact assessment, visual resource study, scenic impact analysis, photo simulation, line-of-sight analysis, or similar visual review document. Do not use this feature for ordinary screening requirements unless the ordinance explicitly requires an assessment or study." + }, + "seismic monitoring plan": { + "description": "Extract explicit requirements for a seismic monitoring plan, induced seismicity monitoring plan, geophysical monitoring plan, or similar seismicity-management document or program. Do not use this feature for generic environmental monitoring unless the ordinance explicitly ties it to seismic or fault-related monitoring." + }, + "permit requirement": { + "description": "Extract explicit permit, approval, authorization, review, certification, or comparable entitlement requirements for geothermal electricity facilities. This includes conditional use permits, special use permits, zoning permits, drilling permits, county permits, municipal permits, use permits, site plan approvals, plans of operation, state board approvals, siting certificates, and similar authorizations. Preserve the ordinance's exact permit or approval names in value. Use an array of strings when multiple permit types are explicitly required." + }, + "bond requirement": { + "description": "Extract explicit bonding, financial assurance, surety, letter-of-credit, security, escrow, reclamation guarantee, decommissioning guarantee, or similar assurance requirements. Use value=null and units=null unless the ordinance states a specific numeric amount or formula that should be preserved in summary. 
If the ordinance uses a formula, engineer estimate, inflation adjustment, or agency-determined amount instead of a fixed number, keep that logic in summary rather than forcing a numeric value." + }, + "decommissioning": { + "description": "Extract requirements for decommissioning, abandonment, plugging, removal, reclamation, site restoration, salvage, or retirement of geothermal electricity facilities, wells, plants, or associated infrastructure, including responsible party and timeline details when explicitly stated." + } + } + }, + "district_features": { + "description": "Zoning, overlay, and land-use district allowances for geothermal electricity facilities.", + "properties": { + "primary use districts": { + "description": "Extract all districts, zones, overlays, resource areas, or similar land-use categories where geothermal electricity facilities are explicitly allowed by right, as a principal permitted use, or under an overlay that functions as a primary-use authorization. Use an array of district names in value. Preserve the exact district names or codes from the ordinance text and describe the allowance in summary." + }, + "special use districts": { + "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are allowed only through special use, conditional use, special exception, discretionary review, or comparable approval. Use an array of district names in value and preserve the approval posture in summary. Do not route the permit type itself here unless it is part of the district allowance language." + }, + "accessory use districts": { + "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are explicitly allowed only as an accessory, incidental, or subordinate use to another principal use. Use an array of district names in value and preserve the accessory-use condition in summary." 
+ }, + "prohibited use districts": { + "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are explicitly and unconditionally prohibited. Use an array of district names in value and quote the prohibitory language in summary. Do not include districts where geothermal electricity may still be allowed with a permit, variance, overlay, or other conditional path." + } + } + }, + "prohibition_features": { + "description": "Prohibitions, bans, or moratoria on geothermal electricity exploration, drilling, facility siting, or deployment.", + "properties": { + "prohibitions": { + "description": "Extract currently effective bans, moratoria, or explicit prohibitions on geothermal electricity exploration, drilling, well development, plant construction, or facility siting. Include fracking or hydraulic-fracturing bans only when the ordinance explicitly uses them to ban, limit, or condition geothermal electricity activity. If there are carve-outs, exceptions, or conditional permitting paths that still allow the project, do not classify the rule as a prohibition." + } + } + }, + "date_features": { + "description": "Effective, enacted, amended, or modified date information for the operative ordinance text.", + "properties": { + "ordinance date": { + "description": "Extract the ordinance enactment, adoption, amendment, or last-modified date when the source text explicitly states it. Use a string value that preserves the ordinance date wording exactly as written, and use units=null. Prefer the current operative enactment or amendment date over superseded historical dates unless the ordinance text clearly indicates multiple active dates are relevant." 
+ } + } + } + }, + "$examples": [ + { + "outputs": [ + { + "feature": "property lines setback", + "value": 500, + "units": "feet", + "section": "Section 8.4 - Setbacks", + "summary": "'Geothermal production wells and associated facilities shall be set back at least 500 feet from all property lines.'", + "explanation": "The excerpt states an explicit numeric minimum separation from property lines, so it maps directly to 'property lines setback' with value 500 feet." + }, + { + "feature": "special use districts", + "value": [ + "Industrial", + "Agricultural" + ], + "units": null, + "section": "Table 4 - Allowed Uses", + "summary": "'Geothermal power plants may be approved as conditional uses in the Industrial and Agricultural districts subject to county review.'", + "explanation": "The ordinance explicitly lists two districts where geothermal power plants are allowed only through conditional-use approval, so this belongs under 'special use districts' with the district names preserved as an array." + }, + { + "feature": "permit requirement", + "value": [ + "conditional use permit", + "county drilling permit" + ], + "units": null, + "section": "Section 5.2 - Approval Process", + "summary": "'A conditional use permit and county drilling permit shall be obtained prior to the construction or operation of any geothermal power plant or exploratory well.'", + "explanation": "The clause explicitly requires two project approvals for geothermal electricity development, so it belongs under 'permit requirement' with both permit names preserved as an array." + }, + { + "feature": "drilling hours", + "value": "7:00 a.m. to 7:00 p.m. Monday through Saturday, excluding holidays", + "units": null, + "section": "Section 6.7 - Drilling Operations", + "summary": "'Routine geothermal drilling activities may occur only between 7:00 a.m. 
and 7:00 p.m., Monday through Saturday, and shall not occur on Sundays or legal holidays except in an emergency.'", + "explanation": "The ordinance states an explicit operating window for geothermal drilling, so this belongs under 'drilling hours' as a schedule string rather than a numeric cap." + }, + { + "feature": "bond requirement", + "value": null, + "units": null, + "section": "Section 9.3 - Financial Assurance", + "summary": "'Prior to permit issuance, the operator shall provide financial assurance in a form acceptable to the state oil, gas, and geothermal agency in an amount sufficient to cover plugging, abandonment, reclamation, and decommissioning costs as determined by the agency engineer.'", + "explanation": "The clause imposes an enforceable financial assurance requirement but leaves the amount to an agency-determined formula, so it fits 'bond requirement' with value and units set to null." + }, + { + "feature": "ordinance date", + "value": "Amended April 12, 2024", + "units": null, + "section": "Ord. No. 2024-11", + "summary": "'Ord. No. 2024-11, amended April 12, 2024.'", + "explanation": "The excerpt explicitly states the operative amendment date for the ordinance, so it matches 'ordinance date'." + } + ] + } + ], + "$instructions": { + "general": [ + "Use direct text excerpts and quotes in summary whenever possible.", + "Each feature may appear at most once in outputs; do not emit multiple rows for the same feature. If multiple ordinance lines map to one feature, build a temporary map keyed by feature, aggregate all evidence clauses under that feature key, consolidate into one row, and keep the controlling most restrictive value in value while listing alternatives in summary.", + "For any numeric feature, the summary must support the same requirement that produced value and units for that row. 
Never pair a numeric value from one clause with qualitative-only language from another clause that has no numeric threshold.", + "Summary is the primary data carrier for all features in this schema; every row must have a non-null, non-empty string for summary.", + "Every row must include an explanation that briefly justifies why the cited summary evidence matches the selected feature under this schema's rules.", + "Emit only positively matched features. Never emit a row to explain why a feature does not apply.", + "The outputs array is a sparse long-form extraction table and does not need to contain every enumerated feature.", + "Tables, table footnotes, and labeled graphics count as valid ordinance evidence when they state the controlling requirement; preserve the relevant table cell or footnote context in summary.", + "Preserve exact local or state regulatory terminology in summary and, where applicable, in value. Do not rename district categories, permit names, or agency approvals into a preferred local template.", + "If ordinance text shows an amended or superseding requirement, extract the current operative requirement as written rather than a superseded historical value unless the ordinance text clearly keeps both rules active.", + "If text is suggestive but not explicit for the target feature, omit the feature." + ], + "setbacks": [ + "Setbacks should be extracted as minimum separation distances.", + "Prefer numeric values with units such as 'feet' or 'meters'.", + "Setback rows must contain numeric value and non-null units; never emit qualitative-only setback rows.", + "If both general and condition-specific setbacks are provided, select the controlling most restrictive value for the geothermal electricity scenario and describe conditions in summary.", + "Do not infer one setback feature from another. 
A property-line setback is not a structures setback, and a roads setback is not a railroad setback, unless the ordinance text explicitly says so.", + "When one setback clause explicitly names multiple target features and provides one shared numeric threshold, emit one row per explicitly named feature using the same threshold and units." + ], + "numerical": [ + "Numerical features in this schema are the eleven setback features plus noise, maximum height, and minimum lot size.", + "For noise, maximum height, and minimum lot size, extract only explicit numeric thresholds. If the ordinance gives only narrative standards or references other codes without restating the threshold, omit the feature.", + "For drilling-hours requirements expressed as time windows or allowed days, use a string value and units=null. Use a numeric value only when the ordinance states a clear numeric cap such as hours per day." + ], + "qualitative": [ + "For qualitative features, output only when an explicit enforceable requirement is present.", + "For fencing, color requirements, lighting requirements, visual impact assessment, seismic monitoring plan, bond requirement, and decommissioning, prefer value=null and units=null unless the ordinance states a specific numeric threshold or an explicit list that should be preserved in value.", + "For permit requirement, use a string for one permit type and an array of strings when multiple permit or approval types are explicitly required.", + "For bond requirement, preserve formulas, engineer estimates, inflation adjustments, agency-set amounts, and similar non-fixed sizing logic in summary instead of forcing a numeric amount.", + "Do not map generic application materials, narrative findings, or descriptive recitals into these features unless the ordinance explicitly makes them enforceable requirements." 
+ ], + "districts": [ + "For all district features, use an array of district or zone names and set units to null.", + "Use the exact district names or codes as they appear in the ordinance text whenever possible.", + "Use 'primary use districts' for by-right or principal-use authorization, 'special use districts' for conditional or discretionary authorization, 'accessory use districts' for accessory-only authorization, and 'prohibited use districts' for unconditional district-level bans.", + "Preserve the legal approval posture in summary, but keep only the district names in value.", + "If the jurisdiction is unzoned, statewide, or otherwise does not use district-style land-use categories for the operative geothermal rule, omit district features rather than inventing a district mapping.", + "If the ordinance does not explicitly list district names, omit the feature rather than paraphrasing a generic zoning statement." + ], + "prohibitions": [ + "Classify prohibitions as currently effective bans or moratoria on geothermal electricity exploration, drilling, well development, facility siting, or deployment.", + "If no active prohibition is found, omit the feature rather than using placeholder values.", + "Distinguish between complete prohibition and conditional permitting. Conditional permitting is not a ban.", + "Do not treat ordinary operational, environmental, design, monitoring, or permit conditions as prohibitions when the ordinance still allows the project to proceed subject to compliance.", + "A fracking ban belongs here only when the ordinance explicitly uses it to regulate geothermal electricity development." + ], + "dates": [ + "For 'ordinance date', preserve the date wording exactly as written in the ordinance text and use units=null.", + "Prefer the current operative enacted, amended, or modified date over historical prior dates unless the source text clearly indicates multiple active dates are relevant to the current ordinance provision." 
+ ] + }, + "$qa_checklist": [ + "Enforce uniqueness by feature with len(outputs) == len(unique(feature values)); if duplicates exist, merge or drop invalid rows until equality is true.", + "Every row must have non-null, non-empty strings for summary and explanation.", + "Explanation must explicitly tie summary evidence to the selected feature and must not contradict feature inclusion or exclusion criteria.", + "For every numeric feature row, require numeric value and non-null units.", + "Remove any numeric-feature row derived only from qualitative language when no numeric threshold is quoted.", + "If summary or explanation indicates the feature is not applicable, omit the row.", + "If a feature fails any check, omit it rather than returning a partial row." + ], + "$qualitative_features": [ + "fencing", + "color requirements", + "lighting requirements", + "visual impact assessment", + "seismic monitoring plan", + "bond requirement", + "decommissioning", + "prohibitions" + ] +} From d9e868de5a567380a17c62c07f6c2d7b3b0ad318 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Tue, 17 Mar 2026 12:11:12 -0600 Subject: [PATCH 02/21] Add COMPASS workflow skills --- .github/skills/extraction-run/SKILL.md | 253 ++++++++++++++++++++++++ .github/skills/schema-creation/SKILL.md | 173 ++++++++++++++++ .github/skills/web-scraper/SKILL.md | 134 +++++++++++++ .github/skills/yaml-setup/SKILL.md | 199 +++++++++++++++++++ 4 files changed, 759 insertions(+) create mode 100644 .github/skills/extraction-run/SKILL.md create mode 100644 .github/skills/schema-creation/SKILL.md create mode 100644 .github/skills/web-scraper/SKILL.md create mode 100644 .github/skills/yaml-setup/SKILL.md diff --git a/.github/skills/extraction-run/SKILL.md b/.github/skills/extraction-run/SKILL.md new file mode 100644 index 00000000..23321b46 --- /dev/null +++ b/.github/skills/extraction-run/SKILL.md @@ -0,0 +1,253 @@ +--- +name: extraction-run +description: Execute 
one-shot extraction with COMPASS, evaluate outputs, and iterate schema/config changes with minimal cost. +--- + +# Extraction Run Skill + +Use this skill to run one-shot extraction in a repeatable, low-risk way, +then iterate quickly until you have stable structured outputs. + +## When to use + +- Schema exists and plugin config points to it. +- You are onboarding a new technology (for example geothermal, CHP, hydrogen). +- You need a reliable smoke-test workflow before scaling. + +## Two-pipeline modes + +COMPASS supports two distinct extraction pipelines. Choose one and do not mix +them for the same technology: + +| Mode | Where code lives | Good for | +|---|---|---| +| **One-shot (schema-based)** | `examples/` → `compass/extraction//` | New techs, no Python changes | +| **Legacy decision-tree** | Python code in `compass/extraction//` | Existing solar, wind, small wind | + +One-shot is the correct path for all new technology onboarding. It requires +only a schema JSON, a plugin YAML, and a run config — no Python source changes. + +## Tech promotion lifecycle + +New technology assets start in `examples/` and finish in `compass/extraction/`: + +1. **Develop** — place all assets in `examples/one_shot_schema_extraction_/` +2. **Stabilize** — iterate schema/plugin until smoke and robustness gates pass +3. **Promote** — copy the three finalized files into `compass/extraction//`: + - `_schema.json` + - `_plugin_config.yaml` + - `_config.json5` (optional; useful as a reference run config) + +The promoted extraction folder contains only config files — no Python code is +needed for one-shot techs. + +## Required inputs + +- Run config for `compass process`. +- Plugin config containing `schema`. +- API keys in environment (never hardcode in configs). +- A jurisdiction set sized to the current phase. 
+
+## Naming convention
+
+Use tech-first names for all one-shot assets:
+
+- `<tech>_config*.json5`
+- `<tech>_plugin_config.yaml`
+- `<tech>_schema.json`
+- `<tech>_jurisdictions*.csv`
+
+The `tech` value in the run config must be a string that becomes the plugin
+registry identifier. It must be unique, lowercase, and underscore-separated
+(for example `concentrating_solar`, `geothermal_electricity`). COMPASS will
+raise `Unknown tech input` if this key does not match any registered plugin.
+
+## Canonical development pattern
+
+For early development, start with the proven dynamic baseline, then fall back
+to deterministic mode only when search infrastructure is unstable:
+
+1. Use one small jurisdiction file (1-3 rows).
+2. Use your preferred configured search engine.
+3. Load `.env` into shell (`set -a && source .env && set +a`).
+4. Run with verbose logs:
+   - `pixi run compass process -c config.json5 -p plugin.yaml -v`
+5. Confirm output artifacts exist before tuning schema semantics.
+
+Fallback mode when needed:
+
+- Add `known_doc_urls` (or `known_local_docs`) in run config.
+- Set `perform_se_search: false` and `perform_website_search: false`.
+
+## Adaptation rule
+
+When adapting this workflow for a new technology, keep the run structure
+unchanged and swap only technology-specific inputs:
+
+- `tech` in run config,
+- schema file,
+- plugin descriptor (`data_type_short_desc`),
+- retrieval query/keyword vocabulary,
+- known document URL set.
+
+Change one axis per run unless debugging infrastructure failures. 
+ +## Example references (optional) + +- `examples/one_shot_schema_extraction/README.rst` +- `examples/one_shot_schema_extraction_geothermal/geothermal_config.json5` +- `examples/one_shot_schema_extraction_geothermal/geothermal_plugin_config.yaml` +- `examples/one_shot_schema_extraction_geothermal/geothermal_schema.json` +- `examples/one_shot_schema_extraction_geothermal/geothermal_jurisdictions_one.csv` +- `examples/one_shot_schema_extraction_geothermal/geothermal_one_shot_guide.md` +- `examples/one_shot_schema_extraction_cst/cst_config.json5` (CST reference) +- `examples/one_shot_schema_extraction_cst/cst_plugin_config.yaml` (CST reference) +- `examples/one_shot_schema_extraction_cst/cst_schema.json` (CST reference) +- `compass/extraction/geothermal_electricity/` (finalized one-shot tech example) +- `docs/source/examples/one_shot_schema_extraction/plugin_config_minimal.json` +- `docs/source/examples/one_shot_schema_extraction/plugin_config.yaml` +- `examples/compass_tech_pipeline/README.md` + +## Environment setup reminder + +Before running, load secrets from `.env` (for example `SERPAPI_KEY`, +`AZURE_OPENAI_API_KEY`) into the current shell. Do not commit secret values +inside config files. + +Common `.env` gotcha: avoid spaces around `=` in variable assignments. + +## Core command + +```bash +pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v +``` + +## Phase-gated workflow + +1. **Smoke test (3 jurisdictions)** + - Goal: verify wiring and output contract. +2. **Robustness (10-25 jurisdictions)** + - Goal: verify feature stability and edge-case handling. +3. **Scale (full set)** + - Goal: only after earlier phases pass acceptance gates. 
+ +## Validation checklist + +Evaluate each run on: + +- document relevance (exclude off-domain content), +- feature coverage vs expected ordinance topics, +- section/summary traceability, +- unit consistency, +- null discipline, +- **scope bleed** — check that no features appear in the output CSVs that + fall outside the schema enum; generic land-use-code documents can cause + unrelated provisions to leak through. Tighten `extraction_system_prompt` + in plugin YAML to fix this. + +## Expected output artifacts + +A successful run produces these files under `out_dir`: + +| Artifact | Meaning | +|---|---| +| `ordinance_files/*.pdf` | Downloaded source documents | +| `cleaned_text/*.txt` | Heuristic-filtered extracted text | +| `jurisdiction_dbs/*.csv` | Per-jurisdiction raw extraction rows | +| `quantitative_ordinances.csv` | Final compiled numeric features | +| `qualitative_ordinances.csv` | Final compiled qualitative features | +| `usage.json` | Per-jurisdiction LLM token and request counts | +| `meta.json` | Run metadata (cost, timing, version) | + +Final CSV columns: `county`, `state`, `subdivision`, `jurisdiction_type`, +`FIPS`, `feature`, `value`, `units`, `adder`, `min_dist`, `max_dist`, +`summary`, `year`, `section`, `source`. + +## Interpreting output status correctly + +`cleaned_text` files can exist while `Number of documents found` is `0`. + +This means acquisition/text collection worked, but no final structured ordinance +rows were emitted into consolidated DB outputs. + +Check in order: + +1. `outputs/*/cleaned_text/*.txt` (text extraction present) +2. `outputs/*/jurisdiction_dbs/*.csv` (per-jurisdiction parsed rows) +3. `outputs/*/quantitative_ordinances.csv` and + `outputs/*/qualitative_ordinances.csv` (final compiled results) + +## Root-cause triage + +- **Wrong or noisy documents** + - Tune query templates, URL keywords, and exclusions. + - Prefer `known_doc_urls` while stabilizing. 
+- **Right documents, wrong fields** + - Tune schema descriptions/examples and ambiguity rules. + - Check `extraction_system_prompt` in plugin YAML — it is the primary + guard against scope bleed from generic legal documents. +- **Correct values, unstable formatting** + - Tighten enums, unit vocabulary, and null behavior. +- **Nothing downloaded / unstable search** + - Disable live search and use deterministic known URLs/local docs. +- **0 documents found for a jurisdiction during website crawl** + - Expected for jurisdictions with few online ordinances. The website + crawl is a second acquisition pass after search-engine retrieval; + 0 results there is not a pipeline failure. + +## Acceptance gates + +Do not advance phases until all are true: + +- Output rows conform to required contract. +- High share of rows include useful `section` and `summary`. +- Feature names are stable and machine-consistent. +- Repeated runs on same sample show minimal drift. + +## Cost and speed controls + +- Keep sample size minimal while tuning. +- Change one variable per run. +- Archive run command, input set, and output path for each iteration. + +## Workspace hygiene (important) + +Keep one canonical working set per technology in `examples/`: + +- one run config, +- one plugin config, +- one schema, +- one jurisdiction file, +- one known docs file. + +Delete stale `_migrated`, `_smoke`, and duplicate output folders to avoid +configuration drift and debugging confusion. + +## Known infrastructure issues + +### Playwright timeouts + +Web search via `rebrowser_playwright` may fail with 60s timeouts on +`Page.wait_for_selector`. Symptoms: +- `TimeoutError: Page.wait_for_selector: Timeout 60000ms exceeded` +- All search queries fail consistently +- Browser session crashes with `ProtocolError: Internal server error, session closed` + +These errors during the **website crawl phase** (second acquisition pass) are +**non-fatal**. COMPASS logs them and continues. 
They do not block the +search-engine phase or extraction. + +If search itself is failing, verify provider credentials are loaded and fall +back to deterministic mode. + +**Workaround**: Use `known_local_docs` or `known_doc_urls` and disable +search/website steps while validating extraction logic. + +### known_local_docs loading failures + +`known_local_docs` may fail silently with `ERROR: Failed to read file` in +jurisdiction logs due to external loader behavior. + +**Workaround**: Prefer `known_doc_urls` for deterministic smoke tests and +pre-validate local docs before pipeline runs. + diff --git a/.github/skills/schema-creation/SKILL.md b/.github/skills/schema-creation/SKILL.md new file mode 100644 index 00000000..951593a3 --- /dev/null +++ b/.github/skills/schema-creation/SKILL.md @@ -0,0 +1,173 @@ +--- +name: schema-creation +description: Author and iterate one-shot extraction schemas that replace legacy decision-tree extraction logic in native COMPASS. +--- + +# Schema Creation Skill + +Use this skill to encode extraction logic in schema so behavior is repeatable +across jurisdictions and technologies. + +## When to use + +- Creating a new one-shot technology plugin. +- Migrating from decision-tree logic to schema-driven extraction. +- Stabilizing inconsistent model outputs. 
+ +## Example references (optional) + +- `examples/one_shot_schema_extraction_geothermal/geothermal_schema.json` +- `examples/one_shot_schema_extraction_geothermal/README.rst` +- `examples/one_shot_schema_extraction_geothermal/geothermal_one_shot_guide.md` +- `docs/source/examples/one_shot_schema_extraction/wind_schema.json` + +## Required output contract + +Top-level object must define `outputs` and each item must require: + +- `feature` +- `value` +- `units` +- `section` +- `summary` + +```json +{ + "type": "object", + "required": ["outputs"], + "properties": { + "outputs": { + "type": "array", + "items": { + "type": "object", + "required": ["feature", "value", "units", "section", "summary"], + "additionalProperties": false + } + } + } +} +``` + +## Build sequence + +1. Copy baseline schema and rename for target tech. +2. Replace `feature` enum with target-tech IDs. +3. Define `value`/`units` rules per feature family. +4. Add `$definitions` for reusable decision logic. +5. Add `$examples` for top failure modes. +6. Add `$instructions` for global extraction policy. + +For new technologies (for example CHP or CST), clone a working schema and +perform a strict vocabulary swap (features, units, exclusions) before adding +new logic. + +## Output column mapping + +Schema field names map directly to the final output CSV columns: + +| Schema field | CSV column | +|---|---| +| `feature` | `feature` | +| `value` | `value` | +| `units` | `units` | +| `section` | `section` | +| `summary` | `summary` | + +Additional columns added by COMPASS finalization: `county`, `state`, +`subdivision`, `jurisdiction_type`, `FIPS`, `adder`, `min_dist`, `max_dist`, +`year`, `source`. These do not need to appear in the schema. + +## Scope bleed from generic legal documents + +When COMPASS retrieves a large generic land-use code rather than a +technology-specific ordinance, the LLM may extract provisions that are +outside the schema enum. 
This is most visible when unfamiliar feature names +appear in the output CSV. + +Primary controls: +- `extraction_system_prompt` in plugin YAML — this is the strongest signal. + State explicitly what is in scope and what is out. +- `$instructions.scope` in schema — reinforce exclusion language here. +- `heuristic_keywords.not_tech_words` — filter documents upstream. + +Do not widen the feature enum to accommodate scope bleed; narrow the prompt +and upstream filters instead. + +## Technology adaptation guidance + +When adapting a baseline schema to any new technology: + +- Separate core utility-scale requirements from adjacent/non-target systems. +- Keep district/permit features distinct from numerical constraints. +- Encode jurisdiction/governance handling where relevant in summaries. +- Require explicit nulls when a feature is not enacted. + +## Cross-technology adaptation checklist + +Apply this for any new domain: + +1. Define technology-specific `feature` enum with stable IDs. +2. Define allowed unit vocabulary for each feature family. +3. Add explicit exclusion language for adjacent-but-out-of-scope systems. +4. Ensure summaries preserve legal traceability (section + source-faithful text). +5. Validate on deterministic docs before tuning retrieval. +6. Consider including `enactment date` in the enum — COMPASS naturally surfaces it + from documents and it provides important temporal context in outputs. + +## Example specialization patterns (optional) + +Use examples only to shape exclusion strategy: + +- separate core utility-scale requirements from adjacent technologies, +- add explicit exclusion terms in `not_tech_words`, +- preserve legal traceability via `section` and `summary`. + +## Reuse safeguards + +- Keep tech-first file names consistent across assets: + `_config*.json5`, `_plugin_config.yaml`, + `_schema.json`, `_jurisdictions*.csv`. +- Keep credentials out of schema content and examples. 
+- Validate schema behavior with a small smoke run before scaling. + +## High-value authoring patterns + +- Put restrictive-value selection rules directly in descriptions. +- Explicitly define accepted unit vocabulary. +- Clarify near-miss terms that should not be treated as equivalent. +- State whether qualitative features should keep `value`/`units` null. + +## Anti-patterns + +- Retrieval instructions embedded in schema semantics. +- Feature IDs that change names across iterations. +- Implicit unit assumptions not declared in text. +- Examples that contradict field descriptions. +- Feature enums that include placeholders with no extraction logic. + +## Quality checklist + +- Enum matches target output columns. +- Every feature has deterministic extraction rules. +- `section` and `summary` preserve legal traceability. +- Repeated sample runs produce stable feature typing. + +## Iteration loop + +1. Run 3-jurisdiction smoke sample. +2. Catalog failure modes by feature. +3. Patch only affected descriptions/examples. +4. Re-run same sample before expanding scope. + +Save iterated schema versions as `_schemav2.json`, `_schemav3.json` +etc. to preserve a diff history. The active version is what `schema:` in the +plugin YAML points to. + +## Practical quality signal + +Treat a schema as "working" when all are true on the smoke sample: + +- final ordinance CSV outputs are non-empty, +- extracted rows include stable feature IDs, +- most non-null rows have useful `section` and `summary`, +- repeated runs do not shift feature semantics materially. diff --git a/.github/skills/web-scraper/SKILL.md b/.github/skills/web-scraper/SKILL.md new file mode 100644 index 00000000..f021bdb8 --- /dev/null +++ b/.github/skills/web-scraper/SKILL.md @@ -0,0 +1,134 @@ +--- +name: web-scraper +description: Build and tune one-shot plugin configs that search, rank, and collect ordinance documents with native COMPASS pipeline settings. 
+--- + +# Web Scraper Skill + +Use this skill to improve retrieval precision/recall before extraction tuning. + +## When to use + +- Download step returns noisy sources. +- Ordinance recall is weak across jurisdictions. +- LLM filtering is compensating for poor search quality. + +## Scope + +- Query-template strategy. +- URL ranking and filtering patterns. +- Heuristic phrase controls before LLM validation. + +## Example references (optional) + +- `examples/one_shot_schema_extraction_geothermal/geothermal_plugin_config.yaml` +- `examples/one_shot_schema_extraction_geothermal/geothermal_config.json5` +- `examples/one_shot_schema_extraction_geothermal/geothermal_jurisdictions_one.csv` +- `examples/one_shot_schema_extraction_cst/cst_plugin_config.yaml` +- `examples/compass_tech_pipeline/README.md` + +## Two retrieval phases + +COMPASS runs two sequential acquisition passes per jurisdiction: + +1. **Search-engine phase** — queries `SerpAPIGoogleSearch` (or configured + engine) using `query_templates`. This phase is the primary source of + ordinance documents. +2. **Website crawl phase** — crawls the jurisdiction's official website, + ranking pages using `website_keywords`. This phase is a secondary pass + and runs even if the SE phase found documents. + +Key behaviors: +- Playwright browser errors during the website crawl phase are **non-fatal**. + COMPASS logs the error and continues. +- `Found 0 potential documents` at the end of the crawl phase is **expected** + for jurisdictions without relevant online ordinances. +- Disable the crawl phase with `perform_website_search: false` in run config + when you want faster smoke tests or Playwright is unavailable. + +## Key management + +For SerpAPI-backed search, keep `api_key` out of committed config and provide +`SERPAPI_KEY` via environment (for example through `.env` loaded in shell). + +Recommended shell setup: + +```bash +set -a +source .env +set +a +``` + +Avoid spaces around `=` in `.env` assignments. 
+ +## Retrieval design pattern + +1. Create 3-7 jurisdiction queries with `{jurisdiction}`. +2. Weight legal document indicators in URL keywords. +3. Apply exclusions for templates/reports/slides. +4. Add focused negative tech terms to reduce false positives. +5. Start with dynamic search, then switch to deterministic known URLs when + search infrastructure is unstable. + +For first-pass reliability, test retrieval with deterministic known URLs +before using live web search. + +## Technology-specific retrieval controls (template) + +- Include target-technology facility/deployment terms. +- Exclude adjacent and non-target terms (residential/HVAC/PV/etc as needed). +- Favor jurisdictional legal-code signals like `land use code`, + `code of ordinances`, `use table`, and `special use permit`. + +## Deterministic smoke-test mode + +Use run-config controls to bypass flaky search while tuning: + +- supply `known_doc_urls` or `known_local_docs`, +- set `perform_se_search: false`, +- set `perform_website_search: false`. + +Then validate: + +- download artifacts exist, +- cleaned text exists, +- ordinance DB rows are non-empty. + +## Tuning loop + +1. Run SE-search phase on small sample. +2. Inspect kept vs discarded PDFs (`ordinance_files/`). +3. Run heuristic filter and review false rejects/accepts (`cleaned_text/`). +4. Check website crawl phase independently if needed (enable, run, inspect logs). +5. Update one axis only: + - query templates (affects SE phase), + - URL weights (affects both phases), + - include/exclude heuristic patterns (pre-LLM filter), + - `not_tech_words` (upstream document rejection). +6. Re-run same sample and compare. 
+ +## Cross-tech onboarding + +When reusing this workflow for any technology: + +- keep legal retrieval tokens (`ordinance`, `zoning`, `code`), +- replace all technology terms in `query_templates`, `website_keywords`, + and `heuristic_keywords`, +- seed `known_doc_urls` with authoritative regulatory documents for smoke + testing, +- avoid copying negatives from previous technologies into the new tech config, +- verify `not_tech_words` excludes adjacent technologies for your domain. + +## Phase gates + +- **3 jurisdictions**: ensure major source classes are found. +- **10-25 jurisdictions**: verify stability across regions. +- **Full scale**: only once false positive/negative rates stabilize. + +## Guardrails + +- Keep feature extraction logic out of retrieval config. +- Do not overfit to one county's document style. +- Preserve auditable rationale for each retrieval change. +- Keep one canonical retrieval config per active technology. +- Ensure each run uses a unique `out_dir` to avoid COMPASS aborting early. diff --git a/.github/skills/yaml-setup/SKILL.md b/.github/skills/yaml-setup/SKILL.md new file mode 100644 index 00000000..a9f93d17 --- /dev/null +++ b/.github/skills/yaml-setup/SKILL.md @@ -0,0 +1,199 @@ +--- +name: yaml-setup +description: Author and tune one-shot plugin YAML configs for COMPASS-native document discovery, filtering, and text collection. +--- + +# YAML Setup Skill + +Use this skill to create or tune one-shot plugin YAML that controls retrieval, +filtering, and text collection behavior. + +## When to use + +- New technology onboarding in one-shot extraction. +- Schema exists but source relevance is weak. +- You need reproducible config handoff across teams. 
+
+## Example references (optional)
+
+- `examples/one_shot_schema_extraction_geothermal/geothermal_plugin_config.yaml`
+- `examples/one_shot_schema_extraction_geothermal/README.rst`
+- `examples/one_shot_schema_extraction_geothermal/geothermal_one_shot_guide.md`
+- `docs/source/examples/one_shot_schema_extraction/plugin_config_minimal.json`
+- `docs/source/examples/one_shot_schema_extraction/plugin_config_simple.json5`
+- `docs/source/examples/one_shot_schema_extraction/plugin_config.yaml`
+
+## Naming convention
+
+Use tech-first file names when creating new one-shot assets:
+`<tech>_config*.json5`, `<tech>_plugin_config.yaml`,
+`<tech>_schema.json`, `<tech>_jurisdictions*.csv`.
+
+## Secret handling
+
+Keep API keys in environment variables (for example `SERPAPI_KEY`,
+`AZURE_OPENAI_API_KEY`) rather than in plugin or run config files.
+Load them per shell session with `set -a && source .env && set +a`.
+Avoid spaces around `=` in `.env` assignments.
+
+## Required minimum
+
+```yaml
+schema: ./my_schema.json
+```
+
+## Key plugin YAML fields
+
+| Field | Type | Behavior |
+|---|---|---|
+| `schema` | string (path) | **Required.** Path to JSON schema file, relative to plugin YAML location. |
+| `data_type_short_desc` | string | Short description used in LLM prompts (e.g. `utility-scale ordinance`). |
+| `query_templates` | list | Search query templates; `{jurisdiction}` is replaced at runtime. |
+| `website_keywords` | dict | Keyword → score map for URL ranking during website crawl. |
+| `heuristic_keywords` | dict or `true` | Pre-LLM text filter. If `true`, LLM generates lists from schema. |
+| `collection_prompts` | list or `true` | Text collection prompt(s). If **`true`**, LLM auto-generates from schema. |
+| `text_extraction_prompts` | list or `true` | Text consolidation prompt(s). If **`true`**, LLM auto-generates from schema. |
+| `extraction_system_prompt` | string | Overrides default LLM system prompt for the extraction step. 
Use this to scope extraction tightly to the target technology. | +| `cache_llm_generated_content` | bool | Cache LLM-generated `query_templates` and `website_keywords`. Set to `false` when iterating schema to see live changes. | + +### `collection_prompts: true` and `text_extraction_prompts: true` + +Setting either flag to `true` (not a list) instructs COMPASS to use the LLM +to auto-generate the prompts from the schema content. This is the recommended +shortcut during development — do not write manual prompt lists until +auto-generated ones prove insufficient. + +### `extraction_system_prompt` + +This is the primary control for preventing scope bleed from generic land-use +code documents. Write it as a multi-line YAML literal block: + +```yaml +extraction_system_prompt: |- + You are a legal scholar extracting structured data from + utility-scale ordinances. + + Extract only enacted requirements for utility-scale facilities. + Exclude adjacent technologies and non-target use cases. + Prefer explicit values. Use null for qualitative obligations. +``` + +See `compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml` +for a complete example. + +## Progressive config path + +1. **Minimal** + - Confirm schema path and extraction invocation work. +2. **Simple** + - Add `query_templates`, `heuristic_keywords`, and `cache_llm_generated_content`. + - Set `collection_prompts: true` and `text_extraction_prompts: true` to + let the LLM auto-generate prompts from the schema. +3. **Full** + - Add `extraction_system_prompt` if scope bleed or off-domain extraction + is observed. + - Replace `heuristic_keywords: true` with an explicit list if precision + is insufficient. + +Use the same progression for any technology. 
+ +## Baseline YAML pattern + +```yaml +schema: ./my_schema.json +data_type_short_desc: utility-scale ordinance +cache_llm_generated_content: true +query_templates: + - "filetype:pdf {jurisdiction} ordinance" + - "{jurisdiction} zoning ordinance" + - "{jurisdiction} permitting requirements" +website_keywords: + pdf: 92160 + <tech>: 46080 + ordinance: 23040 + zoning: 2880 + permit: 1440 +heuristic_keywords: + good_tech_keywords: + - "" + - "" + good_tech_acronyms: + - "" + good_tech_phrases: + - "" + - "" + not_tech_words: + - "" + - "" +collection_prompts: true +text_extraction_prompts: true +extraction_system_prompt: |- + You are a legal scholar extracting structured data from + utility-scale ordinances. + + Extract only requirements for utility-scale facilities. + Exclude adjacent technologies and non-target use cases. +``` + +Swap vocabulary for any technology while keeping the same structure. + +## Stable development mode + +Plugin YAML controls retrieval behavior, but deterministic acquisition for +smoke tests belongs in run config: + +- `known_doc_urls` or `known_local_docs` +- `perform_se_search: false` +- `perform_website_search: false` (disables the website crawl second phase) + +Use this mode first, then re-enable search once schema extraction quality is +stable. + +Alternative baseline: use dynamic search first, then use deterministic mode +if search infrastructure fails. + +## Acquisition phases + +COMPASS acquisition runs in two sequential phases per jurisdiction: + +1. **Search-engine phase** — uses `SerpAPIGoogleSearch` or similar; driven by + `query_templates`. +2. **Website crawl phase** — crawls the jurisdiction's main website using + `website_keywords` for ranking. Playwright browser errors during this + phase are **non-fatal**; COMPASS logs them and moves on. + +`perform_website_search: false` skips phase 2. Use it during smoke tests to +keep run time short and avoid Playwright dependency issues. 
+ +## Validation checklist + +- Schema path resolves from runtime working directory. +- Query templates include `{jurisdiction}` consistently. +- URL weights favor legal and government documents. +- Heuristic exclusions are precise and not over-broad. +- Prompt overrides are only added when default behavior fails. + +## Cross-tech adaptation checklist + +When adapting to another technology: + +- replace vocabulary in `query_templates` and `website_keywords`, +- keep legal-code terms (`ordinance`, `zoning`, `code of ordinances`), +- keep non-target exclusions explicit in `not_tech_words`, +- do not carry terms from a previous technology into new tech configs, +- write a technology-specific `extraction_system_prompt`. + +## Run command + +```bash +pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v +``` + +If running outside the tech folder, use absolute paths for `-c` and `-p`. + +## Guardrails + +- Retrieval behavior belongs in plugin YAML. +- Feature logic belongs in schema. +- Adjust one tuning axis per run for clean attribution. +- Keep one canonical plugin file per technology in the active example folder. 
From 54b8d290a082bbf18310c84efa3c97df61cadc18 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:13:14 -0600 Subject: [PATCH 03/21] Added one-shot skills --- .github/skills/extraction-run/SKILL.md | 57 +++--- .github/skills/schema-creation/SKILL.md | 228 ++++++++++++------------ .github/skills/web-scraper/SKILL.md | 29 +-- .github/skills/yaml-setup/SKILL.md | 107 +++++++---- 4 files changed, 241 insertions(+), 180 deletions(-) diff --git a/.github/skills/extraction-run/SKILL.md b/.github/skills/extraction-run/SKILL.md index 23321b46..e77a10cc 100644 --- a/.github/skills/extraction-run/SKILL.md +++ b/.github/skills/extraction-run/SKILL.md @@ -5,14 +5,19 @@ description: Execute one-shot extraction with COMPASS, evaluate outputs, and ite # Extraction Run Skill +**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction. +For legacy decision-tree extraction (solar, wind, small wind), consult COMPASS +architecture docs. + Use this skill to run one-shot extraction in a repeatable, low-risk way, then iterate quickly until you have stable structured outputs. ## When to use - Schema exists and plugin config points to it. -- You are onboarding a new technology (for example geothermal, CHP, hydrogen). +- You are onboarding a new technology (diesel generator, geothermal, CHP, hydrogen). - You need a reliable smoke-test workflow before scaling. +- You are NOT using legacy decision-tree extraction. ## Two-pipeline modes @@ -48,6 +53,16 @@ needed for one-shot techs. - API keys in environment (never hardcode in configs). - A jurisdiction set sized to the current phase. +## Preflight checks (must pass before run) + +- Jurisdiction CSV has headers `County,State`. +- `out_dir` is unique for this run. +- At least one acquisition step is enabled: + `perform_se_search: true`, `perform_website_search: true`, + `known_doc_urls`, or `known_local_docs`. 
+- If `heuristic_keywords` exists, all four required lists are present and + non-empty. + ## Naming convention Use tech-first names for all one-shot assets: @@ -92,29 +107,19 @@ unchanged and swap only technology-specific inputs: Change one axis per run unless debugging infrastructure failures. -## Example references (optional) +## Canonical reference -- `examples/one_shot_schema_extraction/README.rst` -- `examples/one_shot_schema_extraction_geothermal/geothermal_config.json5` -- `examples/one_shot_schema_extraction_geothermal/geothermal_plugin_config.yaml` -- `examples/one_shot_schema_extraction_geothermal/geothermal_schema.json` -- `examples/one_shot_schema_extraction_geothermal/geothermal_jurisdictions_one.csv` -- `examples/one_shot_schema_extraction_geothermal/geothermal_one_shot_guide.md` -- `examples/one_shot_schema_extraction_cst/cst_config.json5` (CST reference) -- `examples/one_shot_schema_extraction_cst/cst_plugin_config.yaml` (CST reference) -- `examples/one_shot_schema_extraction_cst/cst_schema.json` (CST reference) -- `compass/extraction/geothermal_electricity/` (finalized one-shot tech example) -- `docs/source/examples/one_shot_schema_extraction/plugin_config_minimal.json` -- `docs/source/examples/one_shot_schema_extraction/plugin_config.yaml` -- `examples/compass_tech_pipeline/README.md` +- `examples/one_shot_schema_extraction/` — complete working examples +- `examples/one_shot_schema_extraction/README.rst` — general one-shot overview +- `examples/water_rights_demo/one-shot/` — multi-doc extraction example -## Environment setup reminder +## Environment setup -Before running, load secrets from `.env` (for example `SERPAPI_KEY`, -`AZURE_OPENAI_API_KEY`) into the current shell. Do not commit secret values -inside config files. +Load secrets from `.env` before running. Never commit key values in config files. -Common `.env` gotcha: avoid spaces around `=` in variable assignments. 
+```bash +set -a && source .env && set +a # no spaces around = in .env assignments +``` ## Core command @@ -124,9 +129,9 @@ pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v ## Phase-gated workflow -1. **Smoke test (3 jurisdictions)** +1. **Smoke test (1 jurisdiction)** - Goal: verify wiring and output contract. -2. **Robustness (10-25 jurisdictions)** +2. **Robustness (5 jurisdictions)** - Goal: verify feature stability and edge-case handling. 3. **Scale (full set)** - Goal: only after earlier phases pass acceptance gates. @@ -177,6 +182,14 @@ Check in order: 3. `outputs/*/quantitative_ordinances.csv` and `outputs/*/qualitative_ordinances.csv` (final compiled results) +Treat the run as **failed for extraction quality** when either is true: +- `Number of jurisdictions with extracted data: 0` +- any configuration exception appears in logs (even if process exits 0) + +Only treat a run as passing when both are true: +- at least one jurisdiction has extracted data +- at least one jurisdiction CSV in `jurisdiction_dbs/` has more than header row + ## Root-cause triage - **Wrong or noisy documents** diff --git a/.github/skills/schema-creation/SKILL.md b/.github/skills/schema-creation/SKILL.md index 951593a3..805d8dc9 100644 --- a/.github/skills/schema-creation/SKILL.md +++ b/.github/skills/schema-creation/SKILL.md @@ -5,169 +5,161 @@ description: Author and iterate one-shot extraction schemas that replace legacy # Schema Creation Skill -Use this skill to encode extraction logic in schema so behavior is repeatable -across jurisdictions and technologies. +**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction +(new technology onboarding with JSON schema + plugin YAML). For legacy decision-tree +extraction (existing solar/wind/small-wind in `compass/extraction//`), +consult COMPASS architecture docs. + +Use this skill to define what the LLM extracts and how it formats results. 
+The schema is the single most important config file for output quality. ## When to use -- Creating a new one-shot technology plugin. -- Migrating from decision-tree logic to schema-driven extraction. -- Stabilizing inconsistent model outputs. +- Starting a new one-shot technology extraction (NOT decision-tree legacy extraction). +- Fixing inconsistent or incorrect extracted values in one-shot extraction. +- Adding new features to an existing one-shot extraction. -## Example references (optional) +## Canonical reference -- `examples/one_shot_schema_extraction_geothermal/geothermal_schema.json` -- `examples/one_shot_schema_extraction_geothermal/README.rst` -- `examples/one_shot_schema_extraction_geothermal/geothermal_one_shot_guide.md` -- `docs/source/examples/one_shot_schema_extraction/wind_schema.json` +For complete examples, see the `examples/` directory: +- `examples/one_shot_schema_extraction/wind_schema.json` +- `examples/water_rights_demo/one-shot/water_rights_schema.json5` -## Required output contract +Each follows the pattern: `<tech>_schema.json` or `<tech>_schema.json5`. -Top-level object must define `outputs` and each item must require: +## Required output contract -- `feature` -- `value` -- `units` -- `section` -- `summary` +Every schema must define `outputs` as an array. 
Each item must require +exactly these five fields and set `additionalProperties: false`: ```json { "type": "object", "required": ["outputs"], + "additionalProperties": false, "properties": { "outputs": { "type": "array", "items": { "type": "object", "required": ["feature", "value", "units", "section", "summary"], - "additionalProperties": false + "additionalProperties": false, + "properties": { + "feature": { "type": "string", "enum": ["..."] }, + "value": { "anyOf": [{"type": "number"}, {"type": "string"}, {"type": "boolean"}, {"type": "array", "items": {"type": "string"}}, {"type": "null"}] }, + "units": { "type": ["string", "null"] }, + "section": { "type": ["string", "null"] }, + "summary": { "type": ["string", "null"] } + } } } } } ``` -## Build sequence - -1. Copy baseline schema and rename for target tech. -2. Replace `feature` enum with target-tech IDs. -3. Define `value`/`units` rules per feature family. -4. Add `$definitions` for reusable decision logic. -5. Add `$examples` for top failure modes. -6. Add `$instructions` for global extraction policy. - -For new technologies (for example CHP or CST), clone a working schema and -perform a strict vocabulary swap (features, units, exclusions) before adding -new logic. - -## Output column mapping +These five fields map directly to the output CSV columns. COMPASS adds +`county`, `state`, `FIPS`, and other metadata columns automatically. -Schema field names map directly to the final output CSV columns: - -| Schema field | CSV column | -|---|---| -| `feature` | `feature` | -| `value` | `value` | -| `units` | `units` | -| `section` | `section` | -| `summary` | `summary` | - -Additional columns added by COMPASS finalization: `county`, `state`, -`subdivision`, `jurisdiction_type`, `FIPS`, `adder`, `min_dist`, `max_dist`, -`year`, `source`. These do not need to appear in the schema. +## Build sequence -## Scope bleed from generic legal documents +1. 
**Define the feature enum** — one stable lowercase ID per siting-relevant + requirement. Group IDs by family (setbacks, noise, zoning, permitting). +2. **Define `value` and `units` rules per feature family** — in each + feature's `description`, state the expected value type and accepted unit + vocabulary explicitly. +3. **Add `$definitions`** — group related feature descriptions here to keep + the `feature` enum block clean. +4. **Add `$instructions`** — encode global extraction policy (scope, null + handling, one-row-per-feature contract, verbatim quote preference). +5. **Smoke-test on one jurisdiction** — validate all enum items appear in + output and null rows are correctly populated for missing features. -When COMPASS retrieves a large generic land-use code rather than a -technology-specific ordinance, the LLM may extract provisions that are -outside the schema enum. This is most visible when unfamiliar feature names -appear in the output CSV. +## Feature definition template -Primary controls: -- `extraction_system_prompt` in plugin YAML — this is the strongest signal. - State explicitly what is in scope and what is out. -- `$instructions.scope` in schema — reinforce exclusion language here. -- `heuristic_keywords.not_tech_words` — filter documents upstream. +Every feature description must answer four questions: -Do not widen the feature enum to accommodate scope bleed; narrow the prompt -and upstream filters instead. +1. **What is this?** One sentence identifying the regulatory concept. +2. **VALUE rule:** What type is the value and what specific values/ranges are + valid? +3. **UNITS rule:** What unit string is accepted, or `null` if not applicable? +4. **IGNORE / CLARIFICATION:** What near-miss concepts must NOT match this + feature? -## Technology adaptation guidance +Example (abbreviated): -When adapting a baseline schema to any new technology: +```json +"structure setback": { + "description": "Minimum distance from the generator to an occupied building. 
VALUE: numerical distance. UNITS: 'feet' or 'meters'. IGNORE: setbacks from property lines or roads — those are separate features." +} +``` -- Separate core utility-scale requirements from adjacent/non-target systems. -- Keep district/permit features distinct from numerical constraints. -- Encode jurisdiction/governance handling where relevant in summaries. -- Require explicit nulls when a feature is not enacted. +## Feature family taxonomy -## Cross-technology adaptation checklist +Organize `$definitions` by these families: -Apply this for any new domain: +| Family | Example features | +|---|---| +| Setbacks | `structure setback`, `property line setback`, `road setback` | +| Noise/Emissions | `noise limit`, `emissions standard`, `vibration limit` | +| Operational | `hours of operation` | +| Physical design | `screening requirement`, `enclosure requirement`, `exhaust stack height` | +| Zoning | `primary use districts`, `conditional use districts`, `prohibited use districts` | +| Permitting | `permit requirement`, `capacity threshold` | +| Compliance | `decommissioning`, `enactment date` | -1. Define technology-specific `feature` enum with stable IDs. -2. Define allowed unit vocabulary for each feature family. -3. Add explicit exclusion language for adjacent-but-out-of-scope systems. -4. Ensure summaries preserve legal traceability (section + source-faithful text). -5. Validate on deterministic docs before tuning retrieval. -6. Consider including `enactment date` in the enum — COMPASS naturally surfaces it - from documents and it provides important temporal context in outputs. +## `$instructions` block -## Example specialization patterns (optional) +Always include a `$instructions` object at the top level with these keys: -Use examples only to shape exclusion strategy: +```json +"$instructions": { + "scope": "Describe exactly what to extract and what to ignore.", + "null_handling": "Output every enum feature. 
Use null value and null summary when a feature is not found in the document. Do not omit features.", + "one_row_per_feature": "Output exactly one row per feature. If multiple values apply, use the most restrictive and describe variants in summary.", + "verbatim_quotes": "In summary fields, prefer verbatim quotes from the source. Enclose in double quotation marks.", + "units_discipline": "Do not convert units. Record them exactly as they appear in the document." +} +``` -- separate core utility-scale requirements from adjacent technologies, -- add explicit exclusion terms in `not_tech_words`, -- preserve legal traceability via `section` and `summary`. +## Scope bleed control -## Reuse safeguards +When COMPASS retrieves a large land-use code instead of a tech-specific +ordinance, the LLM may extract off-domain provisions. -- Keep tech-first file names consistent across assets: - `_config*.json5`, `_plugin_config.yaml`, - `_schema.json`, `_jurisdictions*.csv`. -- Keep credentials out of schema content and examples. -- Validate schema behavior with a small smoke run before scaling. +Fix order (most powerful first): +1. `extraction_system_prompt` in plugin YAML — state explicitly what is in + scope and what is excluded. +2. `$instructions.scope` in schema — reinforce with exclusion language. +3. `heuristic_keywords.NOT_TECH_WORDS` — reject documents upstream. -## High-value authoring patterns +Do not expand the feature enum to absorb scope bleed. Narrow the prompt. -- Put restrictive-value selection rules directly in descriptions. -- Explicitly define accepted unit vocabulary. -- Clarify near-miss terms that should not be treated as equivalent. -- State whether qualitative features should keep `value`/`units` null. +## Cross-technology adaptation checklist -## Anti-patterns +When cloning this schema for a new technology: -- Retrieval instructions embedded in schema semantics. -- Feature IDs that change names across iterations. 
-- Implicit unit assumptions not declared in text. -- Examples that contradict field descriptions. -- Feature enums that include placeholders with no extraction logic. +- [ ] Replace all feature IDs with technology-specific names. +- [ ] Replace value/units rules in every feature description. +- [ ] Replace exclusion terms in `$instructions.scope` and feature IGNORE + clauses. +- [ ] Replace `$definitions` group names to match new feature families. +- [ ] Smoke-test before widening to 10+ jurisdictions. ## Quality checklist -- Enum matches target output columns. -- Every feature has deterministic extraction rules. -- `section` and `summary` preserve legal traceability. -- Repeated sample runs produce stable feature typing. - -## Iteration loop - -1. Run 3-jurisdiction smoke sample. -2. Catalog failure modes by feature. -3. Patch only affected descriptions/examples. -4. Re-run same sample before expanding scope. - -Save iterated schema versions as `_schemav2.json`, `_schemav3.json` -etc. to preserve a diff history. The active version is what `schema:` in the -plugin YAML points to. - -## Practical quality signal - -Treat a schema as "working" when all are true on the smoke sample: - -- final ordinance CSV outputs are non-empty, -- extracted rows include stable feature IDs, -- most non-null rows have useful `section` and `summary`, -- repeated runs do not shift feature semantics materially. +- [ ] Feature enum uses stable, lowercase, underscore-separated IDs. +- [ ] Every feature description contains VALUE, UNITS, and IGNORE clauses. +- [ ] `$instructions` block is present with all five keys. +- [ ] `additionalProperties: false` is set on the top-level object and on + each item in the `outputs` array. +- [ ] Schema validates cleanly against a JSON Schema validator. +- [ ] A smoke run using this schema produces extracted rows (not just + successful process exit logs). + +## Anti-patterns to avoid + +- Feature IDs that change names between iterations. 
+- Implicit unit assumptions not stated in description text. +- Missing IGNORE clauses for common near-miss features. +- Examples in descriptions that contradict field rules. +- Widening the enum to absorb scope bleed instead of tightening the prompt. diff --git a/.github/skills/web-scraper/SKILL.md b/.github/skills/web-scraper/SKILL.md index f021bdb8..f5149364 100644 --- a/.github/skills/web-scraper/SKILL.md +++ b/.github/skills/web-scraper/SKILL.md @@ -6,11 +6,13 @@ description: Build and tune one-shot plugin configs that search, rank, and colle # Web Scraper Skill Use this skill to improve retrieval precision/recall before extraction tuning. +Applies to both one-shot (schema-driven) and legacy decision-tree extraction +pipelines. ## When to use -- Download step returns noisy sources. -- Ordinance recall is weak across jurisdictions. +- Download step returns noisy sources (one-shot extraction). +- Ordinance recall is weak across jurisdictions (one-shot extraction). - LLM filtering is compensating for poor search quality. ## Scope @@ -19,13 +21,11 @@ Use this skill to improve retrieval precision/recall before extraction tuning. - URL ranking and filtering patterns. - Heuristic phrase controls before LLM validation. -## Example references (optional) +## Canonical reference -- `examples/one_shot_schema_extraction_geothermal/geothermal_plugin_config.yaml` -- `examples/one_shot_schema_extraction_geothermal/geothermal_config.json5` -- `examples/one_shot_schema_extraction_geothermal/geothermal_jurisdictions_one.csv` -- `examples/one_shot_schema_extraction_cst/cst_plugin_config.yaml` -- `examples/compass_tech_pipeline/README.md` +Consult example plugin configurations in `examples/` following the tech-first naming pattern: +- `_plugin_config.yaml` — standard one-shot config +- See `examples/water_rights_demo/one-shot/plugin_config.yaml` for multi-document edge cases ## Two retrieval phases @@ -70,6 +70,15 @@ Avoid spaces around `=` in `.env` assignments. 5. 
Start with dynamic search, then switch to deterministic known URLs when search infrastructure is unstable. +When using `heuristic_keywords`, include all required lists: +- `GOOD_TECH_KEYWORDS` +- `GOOD_TECH_PHRASES` +- `GOOD_TECH_ACRONYMS` +- `NOT_TECH_WORDS` + +If any required list is missing or empty, COMPASS raises a plugin +configuration error and extraction quality should be treated as failed. + For first-pass reliability, test retrieval with deterministic known URLs before using live web search. @@ -104,7 +113,7 @@ Then validate: - query templates (affects SE phase), - URL weights (affects both phases), - include/exclude heuristic patterns (pre-LLM filter), - - `not_tech_words` (upstream document rejection). + - `NOT_TECH_WORDS` (upstream document rejection). 6. Re-run same sample and compare. ## Cross-tech onboarding @@ -117,7 +126,7 @@ When reusing this workflow for any technology: - seed `known_doc_urls` with authoritative regulatory documents for smoke testing, - avoid copying negatives from previous technologies into the new tech config, -- verify `not_tech_words` excludes adjacent technologies for your domain. +- verify `NOT_TECH_WORDS` excludes adjacent technologies for your domain. ## Phase gates diff --git a/.github/skills/yaml-setup/SKILL.md b/.github/skills/yaml-setup/SKILL.md index a9f93d17..11a360af 100644 --- a/.github/skills/yaml-setup/SKILL.md +++ b/.github/skills/yaml-setup/SKILL.md @@ -5,23 +5,25 @@ description: Author and tune one-shot plugin YAML configs for COMPASS-native doc # YAML Setup Skill +**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction. +For legacy decision-tree extraction, consult COMPASS architecture docs. + Use this skill to create or tune one-shot plugin YAML that controls retrieval, filtering, and text collection behavior. ## When to use -- New technology onboarding in one-shot extraction. +- New technology onboarding in one-shot extraction (NOT decision-tree extraction). 
- Schema exists but source relevance is weak. - You need reproducible config handoff across teams. -## Example references (optional) +## Canonical reference + +With tech-first naming, configuration examples follow this pattern: +- `examples/one_shot_schema_extraction/_plugin_config.yaml` — standard working example +- `examples/water_rights_demo/one-shot/plugin_config.yaml` — multi-doc edge case -- `examples/one_shot_schema_extraction_geothermal/geothermal_plugin_config.yaml` -- `examples/one_shot_schema_extraction_geothermal/README.rst` -- `examples/one_shot_schema_extraction_geothermal/geothermal_one_shot_guide.md` -- `docs/source/examples/one_shot_schema_extraction/plugin_config_minimal.json` -- `docs/source/examples/one_shot_schema_extraction/plugin_config_simple.json5` -- `docs/source/examples/one_shot_schema_extraction/plugin_config.yaml` +Refer to any complete example in `examples/` that matches your retrieval goals. ## Naming convention @@ -42,6 +44,14 @@ Avoid spaces around `=` in `.env` assignments. schema: ./my_schema.json ``` +## Non-negotiable runtime constraints + +- Jurisdiction CSV headers are case-sensitive: use `County,State`. +- If `heuristic_keywords` is present, it must include all four lists and + none may be empty. +- A run is not considered passing if logs show config errors or if + extracted jurisdiction count is zero. + ## Key plugin YAML fields | Field | Type | Behavior | @@ -56,6 +66,26 @@ schema: ./my_schema.json | `extraction_system_prompt` | string | Overrides default LLM system prompt for the extraction step. Use this to scope extraction tightly to the target technology. | | `cache_llm_generated_content` | bool | Cache LLM-generated `query_templates` and `website_keywords`. Set to `false` when iterating schema to see live changes. 
| +## Required `heuristic_keywords` shape + +Use this exact structure when defining `heuristic_keywords`: + +```yaml +heuristic_keywords: + GOOD_TECH_KEYWORDS: + - "" + GOOD_TECH_PHRASES: + - "" + GOOD_TECH_ACRONYMS: + - "" + NOT_TECH_WORDS: + - "" +``` + +Notes: +- Keys are normalized, but using canonical key names reduces mistakes. +- All four lists are required and must be non-empty. + ### `collection_prompts: true` and `text_extraction_prompts: true` Setting either flag to `true` (not a list) instructs COMPASS to use the LLM @@ -87,11 +117,11 @@ for a complete example. - Confirm schema path and extraction invocation work. 2. **Simple** - Add `query_templates`, `heuristic_keywords`, and `cache_llm_generated_content`. - - Set `collection_prompts: true` and `text_extraction_prompts: true` to - let the LLM auto-generate prompts from the schema. 3. **Full** - Add `extraction_system_prompt` if scope bleed or off-domain extraction is observed. + - Set `collection_prompts: true` and `text_extraction_prompts: true` to + let the LLM auto-generate prompts from the schema. - Replace `heuristic_keywords: true` with an explicit list if precision is insufficient. @@ -114,44 +144,61 @@ website_keywords: zoning: 2880 permit: 1440 heuristic_keywords: - good_tech_keywords: + GOOD_TECH_KEYWORDS: - "" - "" - good_tech_acronyms: + GOOD_TECH_ACRONYMS: - "" - good_tech_phrases: + GOOD_TECH_PHRASES: - "" - "" - not_tech_words: + NOT_TECH_WORDS: - "" - "" -collection_prompts: true -text_extraction_prompts: true -extraction_system_prompt: |- - You are a legal scholar extracting structured data from - utility-scale ordinances. - - Extract only requirements for utility-scale facilities. - Exclude adjacent technologies and non-target use cases. ``` Swap vocabulary for any technology while keeping the same structure. 
## Stable development mode -Plugin YAML controls retrieval behavior, but deterministic acquisition for -smoke tests belongs in run config: +Use run-config controls for deterministic smoke tests while iterating schema: -- `known_doc_urls` or `known_local_docs` -- `perform_se_search: false` -- `perform_website_search: false` (disables the website crawl second phase) +- `known_doc_urls` or `known_local_docs` — bypass live search +- `perform_se_search: false` — disable search-engine phase +- `perform_website_search: false` — disable website crawl phase -Use this mode first, then re-enable search once schema extraction quality is -stable. +Re-enable search only after extraction quality is stable on known documents. Recommended baseline: use dynamic search first, then use deterministic mode if search infrastructure fails. +## Minimal run-config contract (to pair with plugin YAML) + +Use this pattern and require users to provide their own model and client +values: + +```json5 +{ + out_dir: "./outputs__", + tech: "", + jurisdiction_fp: "./_jurisdictions.csv", + perform_se_search: true, + perform_website_search: false, + model: [ + { + name: "", + llm_call_kwargs: { temperature: 0, timeout: 600 }, + client_kwargs: { + api_version: "", + azure_endpoint: "" + } + } + ] +} +``` + +Do not hardcode model names in skills. Prompt the user to supply `name`. + ## Acquisition phases COMPASS acquisition runs in two sequential phases per jurisdiction: @@ -179,7 +226,7 @@ When adapting to another technology: - replace vocabulary in `query_templates` and `website_keywords`, - keep legal-code terms (`ordinance`, `zoning`, `code of ordinances`), -- keep non-target exclusions explicit in `not_tech_words`, +- keep non-target exclusions explicit in `NOT_TECH_WORDS`, - do not carry terms from a previous technology into new tech configs, - write a technology-specific `extraction_system_prompt`. 
From a71447f1dbf7197cf651538d720f88faaf73a7e8 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Tue, 17 Mar 2026 13:32:28 -0600 Subject: [PATCH 04/21] update one-shot SKILL.md structure and trigger contracts --- .github/skills/extraction-run/SKILL.md | 28 ++++++++++++++++++------- .github/skills/schema-creation/SKILL.md | 20 +++++++++++++++--- .github/skills/web-scraper/SKILL.md | 24 ++++++++++++++++----- .github/skills/yaml-setup/SKILL.md | 17 ++++++++++++++- 4 files changed, 73 insertions(+), 16 deletions(-) diff --git a/.github/skills/extraction-run/SKILL.md b/.github/skills/extraction-run/SKILL.md index e77a10cc..00be8f92 100644 --- a/.github/skills/extraction-run/SKILL.md +++ b/.github/skills/extraction-run/SKILL.md @@ -1,6 +1,6 @@ --- name: extraction-run -description: Execute one-shot extraction with COMPASS, evaluate outputs, and iterate schema/config changes with minimal cost. +description: Execute one-shot extraction with COMPASS and iterate quickly with low cost. Use whenever a user asks to run, smoke-test, validate, debug, or scale one-shot schema extraction for any technology. --- # Extraction Run Skill @@ -19,6 +19,26 @@ then iterate quickly until you have stable structured outputs. - You need a reliable smoke-test workflow before scaling. - You are NOT using legacy decision-tree extraction. +## Do not use + +- Legacy decision-tree extraction feature engineering. +- Python parser implementation in `compass/extraction//parse.py`. +- Non-extraction tasks (for example docs-only updates). + +## Expected assistant output + +When using this skill, return: + +1. The exact `pixi run compass process ...` command used. +2. A pass/fail decision against extraction-quality gates. +3. The smallest next config/schema change and why. 
+ +## Canonical reference + +- `examples/one_shot_schema_extraction/` — complete working examples +- `examples/one_shot_schema_extraction/README.rst` — general one-shot overview +- `examples/water_rights_demo/one-shot/` — multi-doc extraction example + ## Two-pipeline modes COMPASS supports two distinct extraction pipelines. Choose one and do not mix @@ -107,12 +127,6 @@ unchanged and swap only technology-specific inputs: Change one axis per run unless debugging infrastructure failures. -## Canonical reference - -- `examples/one_shot_schema_extraction/` — complete working examples -- `examples/one_shot_schema_extraction/README.rst` — general one-shot overview -- `examples/water_rights_demo/one-shot/` — multi-doc extraction example - ## Environment setup Load secrets from `.env` before running. Never commit key values in config files. diff --git a/.github/skills/schema-creation/SKILL.md b/.github/skills/schema-creation/SKILL.md index 805d8dc9..c4941bc1 100644 --- a/.github/skills/schema-creation/SKILL.md +++ b/.github/skills/schema-creation/SKILL.md @@ -1,6 +1,6 @@ --- name: schema-creation -description: Author and iterate one-shot extraction schemas that replace legacy decision-tree extraction logic in native COMPASS. +description: Author and iterate one-shot extraction schemas for native COMPASS. Use whenever a user asks to create, expand, or debug schema feature definitions, value/unit rules, or extraction instructions. --- # Schema Creation Skill @@ -19,6 +19,19 @@ The schema is the single most important config file for output quality. - Fixing inconsistent or incorrect extracted values in one-shot extraction. - Adding new features to an existing one-shot extraction. +## Do not use + +- Retrieval tuning tasks that belong in plugin YAML. +- Legacy decision-tree extraction parser implementation. + +## Expected assistant output + +When using this skill, return: + +1. The proposed schema diff (or full schema block) for the targeted features. +2. 
The rationale for VALUE, UNITS, and IGNORE wording. +3. A smoke-test check plan for validating the schema change. + ## Canonical reference For complete examples, see the `examples/` directory: @@ -63,7 +76,8 @@ These five fields map directly to the output CSV columns. COMPASS adds ## Build sequence 1. **Define the feature enum** — one stable lowercase ID per siting-relevant - requirement. Group IDs by family (setbacks, noise, zoning, permitting). + requirement. Keep naming consistent across iterations and group IDs by + family (setbacks, noise, zoning, permitting). 2. **Define `value` and `units` rules per feature family** — in each feature's `description`, state the expected value type and accepted unit vocabulary explicitly. @@ -147,7 +161,7 @@ When cloning this schema for a new technology: ## Quality checklist -- [ ] Feature enum uses stable, lowercase, underscore-separated IDs. +- [ ] Feature enum uses stable, consistent IDs across all runs. - [ ] Every feature description contains VALUE, UNITS, and IGNORE clauses. - [ ] `$instructions` block is present with all five keys. - [ ] `additionalProperties: false` is set on the top-level object and on diff --git a/.github/skills/web-scraper/SKILL.md b/.github/skills/web-scraper/SKILL.md index f5149364..27a3fa37 100644 --- a/.github/skills/web-scraper/SKILL.md +++ b/.github/skills/web-scraper/SKILL.md @@ -1,6 +1,6 @@ --- name: web-scraper -description: Build and tune one-shot plugin configs that search, rank, and collect ordinance documents with native COMPASS pipeline settings. +description: Build and tune retrieval configs that search, rank, and collect ordinance documents in COMPASS. Use whenever a user asks to improve retrieval precision/recall, tune search queries/keywords, or debug acquisition quality before extraction tuning. --- # Web Scraper Skill @@ -15,11 +15,18 @@ pipelines. - Ordinance recall is weak across jurisdictions (one-shot extraction). - LLM filtering is compensating for poor search quality. 
-## Scope +## Do not use -- Query-template strategy. -- URL ranking and filtering patterns. -- Heuristic phrase controls before LLM validation. +- Schema feature definition or value extraction logic design. +- Post-extraction feature/value debugging when retrieval is already correct. + +## Expected assistant output + +When using this skill, return: + +1. The retrieval axis changed (queries, keyword weights, or heuristics). +2. Evidence from artifacts/logs showing why the change was needed. +3. The next run command against the same jurisdiction sample. ## Canonical reference @@ -27,6 +34,12 @@ Consult example plugin configurations in `examples/` following the tech-first na - `_plugin_config.yaml` — standard one-shot config - See `examples/water_rights_demo/one-shot/plugin_config.yaml` for multi-document edge cases +## Scope + +- Query-template strategy. +- URL ranking and filtering patterns. +- Heuristic phrase controls before LLM validation. + ## Two retrieval phases COMPASS runs two sequential acquisition passes per jurisdiction: @@ -141,3 +154,4 @@ When reusing this workflow for any technology: - Preserve auditable rationale for each retrieval change. - Keep one canonical retrieval config per active technology. - Ensure each run uses a unique `out_dir` to avoid COMPASS aborting early. + diff --git a/.github/skills/yaml-setup/SKILL.md b/.github/skills/yaml-setup/SKILL.md index 11a360af..af2a82e5 100644 --- a/.github/skills/yaml-setup/SKILL.md +++ b/.github/skills/yaml-setup/SKILL.md @@ -1,6 +1,6 @@ --- name: yaml-setup -description: Author and tune one-shot plugin YAML configs for COMPASS-native document discovery, filtering, and text collection. +description: Author and tune one-shot plugin YAML for COMPASS document discovery, filtering, and text collection. Use whenever a user asks to create, clean up, standardize, or troubleshoot one-shot plugin YAML for technology onboarding. --- # YAML Setup Skill @@ -17,6 +17,20 @@ filtering, and text collection behavior. 
- Schema exists but source relevance is weak. - You need reproducible config handoff across teams. +## Do not use + +- Legacy decision-tree parser implementation changes. +- Schema feature semantics work that belongs in `_schema.json`. +- Run-result diagnosis after outputs are generated (use iteration loop skill). + +## Expected assistant output + +When using this skill, return: + +1. The finalized plugin YAML content or exact diff. +2. Any required paired run-config changes. +3. A validation command and pass/fail checks for the edited YAML. + ## Canonical reference With tech-first naming, configuration examples follow this pattern: @@ -244,3 +258,4 @@ If running outside the tech folder, use absolute paths for `-c` and `-p`. - Feature logic belongs in schema. - Adjust one tuning axis per run for clean attribution. - Keep one canonical plugin file per technology in the active example folder. + From 74495a6648822194a966b0e48c80a4b37413c5b8 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Mar 2026 15:21:58 -0600 Subject: [PATCH 05/21] Initial plan (#398) Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> From 81fcbff8b786d1167527d0e0b5a22841ab05055c Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 17 Mar 2026 16:26:22 -0600 Subject: [PATCH 06/21] Fix skills documentation: correct paths, caching behavior, and tab formatting (#399) * Initial plan * Fix all review comments in skills documentation Co-authored-by: bpulluta <115118857+bpulluta@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: bpulluta <115118857+bpulluta@users.noreply.github.com> --- .github/skills/extraction-run/SKILL.md | 44 ++++++++++++++------------ .github/skills/web-scraper/SKILL.md | 20 +++++++----- .github/skills/yaml-setup/SKILL.md | 14 +++++--- 3 files changed, 44 insertions(+), 34 
deletions(-) diff --git a/.github/skills/extraction-run/SKILL.md b/.github/skills/extraction-run/SKILL.md index 00be8f92..a356eea2 100644 --- a/.github/skills/extraction-run/SKILL.md +++ b/.github/skills/extraction-run/SKILL.md @@ -56,15 +56,17 @@ only a schema JSON, a plugin YAML, and a run config — no Python source changes New technology assets start in `examples/` and finish in `compass/extraction/`: -1. **Develop** — place all assets in `examples/one_shot_schema_extraction_/` +1. **Develop** — place all assets in `examples/one_shot_schema_extraction/` 2. **Stabilize** — iterate schema/plugin until smoke and robustness gates pass 3. **Promote** — copy the three finalized files into `compass/extraction//`: - `_schema.json` - `_plugin_config.yaml` - `_config.json5` (optional; useful as a reference run config) + - `__init__.py` — registers the plugin via `create_schema_based_one_shot_extraction_plugin` -The promoted extraction folder contains only config files — no Python code is -needed for one-shot techs. + After creating the package, add an import in `compass/extraction/__init__.py` + to register the plugin at startup. See `compass/extraction/ghp/__init__.py` + for a reference implementation. ## Required inputs @@ -78,10 +80,10 @@ needed for one-shot techs. - Jurisdiction CSV has headers `County,State`. - `out_dir` is unique for this run. - At least one acquisition step is enabled: - `perform_se_search: true`, `perform_website_search: true`, - `known_doc_urls`, or `known_local_docs`. + `perform_se_search: true`, `perform_website_search: true`, + `known_doc_urls`, or `known_local_docs`. - If `heuristic_keywords` exists, all four required lists are present and - non-empty. + non-empty. ## Naming convention @@ -106,7 +108,7 @@ to deterministic mode only when search infrastructure is unstable: 2. Use your preferred configured search engine. 3. Load `.env` into shell (`set -a && source .env && set +a`). 4. 
Run with verbose logs: - - `pixi run compass process -c config.json5 -p plugin.yaml -v` + - `pixi run compass process -c config.json5 -p plugin.yaml -v` 5. Confirm output artifacts exist before tuning schema semantics. Fallback mode when needed: @@ -144,11 +146,11 @@ pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v ## Phase-gated workflow 1. **Smoke test (1 jurisdiction)** - - Goal: verify wiring and output contract. + - Goal: verify wiring and output contract. 2. **Robustness (5 jurisdictions)** - - Goal: verify feature stability and edge-case handling. + - Goal: verify feature stability and edge-case handling. 3. **Scale (full set)** - - Goal: only after earlier phases pass acceptance gates. + - Goal: only after earlier phases pass acceptance gates. ## Validation checklist @@ -194,7 +196,7 @@ Check in order: 1. `outputs/*/cleaned_text/*.txt` (text extraction present) 2. `outputs/*/jurisdiction_dbs/*.csv` (per-jurisdiction parsed rows) 3. `outputs/*/quantitative_ordinances.csv` and - `outputs/*/qualitative_ordinances.csv` (final compiled results) + `outputs/*/qualitative_ordinances.csv` (final compiled results) Treat the run as **failed for extraction quality** when either is true: - `Number of jurisdictions with extracted data: 0` @@ -207,20 +209,20 @@ Only treat a run as passing when both are true: ## Root-cause triage - **Wrong or noisy documents** - - Tune query templates, URL keywords, and exclusions. - - Prefer `known_doc_urls` while stabilizing. + - Tune query templates, URL keywords, and exclusions. + - Prefer `known_doc_urls` while stabilizing. - **Right documents, wrong fields** - - Tune schema descriptions/examples and ambiguity rules. - - Check `extraction_system_prompt` in plugin YAML — it is the primary - guard against scope bleed from generic legal documents. + - Tune schema descriptions/examples and ambiguity rules. 
+ - Check `extraction_system_prompt` in plugin YAML — it is the primary + guard against scope bleed from generic legal documents. - **Correct values, unstable formatting** - - Tighten enums, unit vocabulary, and null behavior. + - Tighten enums, unit vocabulary, and null behavior. - **Nothing downloaded / unstable search** - - Disable live search and use deterministic known URLs/local docs. + - Disable live search and use deterministic known URLs/local docs. - **0 documents found for a jurisdiction during website crawl** - - Expected for jurisdictions with few online ordinances. The website - crawl is a second acquisition pass after search-engine retrieval; - 0 results there is not a pipeline failure. + - Expected for jurisdictions with few online ordinances. The website + crawl is a second acquisition pass after search-engine retrieval; + 0 results there is not a pipeline failure. ## Acceptance gates diff --git a/.github/skills/web-scraper/SKILL.md b/.github/skills/web-scraper/SKILL.md index 27a3fa37..05a078f0 100644 --- a/.github/skills/web-scraper/SKILL.md +++ b/.github/skills/web-scraper/SKILL.md @@ -30,9 +30,12 @@ When using this skill, return: ## Canonical reference -Consult example plugin configurations in `examples/` following the tech-first naming pattern: -- `_plugin_config.yaml` — standard one-shot config -- See `examples/water_rights_demo/one-shot/plugin_config.yaml` for multi-document edge cases +Consult example plugin configurations in `examples/`: +- `examples/one_shot_schema_extraction/plugin_config.yaml` — standard one-shot config +- `examples/water_rights_demo/one-shot/plugin_config.yaml` — multi-document edge cases + +When creating new tech configs, use `_plugin_config.yaml` as a recommended +naming convention (e.g. `geothermal_plugin_config.yaml`). ## Scope @@ -49,7 +52,8 @@ COMPASS runs two sequential acquisition passes per jurisdiction: ordinance documents. 2. 
**Website crawl phase** — crawls the jurisdiction's official website, ranking pages using `website_keywords`. This phase is a secondary pass - and runs even if the SE phase found documents. + and runs only if the search-engine phase did not yield an ordinance + context. Key behaviors: - Playwright browser errors during the website crawl phase are **non-fatal**. @@ -123,10 +127,10 @@ Then validate: 3. Run heuristic filter and review false rejects/accepts (`cleaned_text/`). 4. Check website crawl phase independently if needed (enable, run, inspect logs). 5. Update one axis only: - - query templates (affects SE phase), - - URL weights (affects both phases), - - include/exclude heuristic patterns (pre-LLM filter), - - `NOT_TECH_WORDS` (upstream document rejection). + - query templates (affects SE phase), + - URL weights (affects both phases), + - include/exclude heuristic patterns (pre-LLM filter), + - `NOT_TECH_WORDS` (upstream document rejection). 6. Re-run same sample and compare. ## Cross-tech onboarding diff --git a/.github/skills/yaml-setup/SKILL.md b/.github/skills/yaml-setup/SKILL.md index af2a82e5..1502085c 100644 --- a/.github/skills/yaml-setup/SKILL.md +++ b/.github/skills/yaml-setup/SKILL.md @@ -33,10 +33,15 @@ When using this skill, return: ## Canonical reference -With tech-first naming, configuration examples follow this pattern: -- `examples/one_shot_schema_extraction/_plugin_config.yaml` — standard working example +Consult the working examples in `examples/`: +- `examples/one_shot_schema_extraction/plugin_config.yaml` — standard working example - `examples/water_rights_demo/one-shot/plugin_config.yaml` — multi-doc edge case +When creating new tech configs, `_plugin_config.yaml` is the recommended +naming convention (e.g. `geothermal_plugin_config.yaml`). The existing +`plugin_config.yaml` examples use a generic name; new tech-specific assets +should use the tech-first naming pattern. 
+ Refer to any complete example in `examples/` that matches your retrieval goals. ## Naming convention @@ -78,7 +83,7 @@ schema: ./my_schema.json | `collection_prompts` | list or `true` | Text collection prompt(s). If **`true`**, LLM auto-generates from schema. | | `text_extraction_prompts` | list or `true` | Text consolidation prompt(s). If **`true`**, LLM auto-generates from schema. | | `extraction_system_prompt` | string | Overrides default LLM system prompt for the extraction step. Use this to scope extraction tightly to the target technology. | -| `cache_llm_generated_content` | bool | Cache LLM-generated `query_templates` and `website_keywords`. Set to `false` when iterating schema to see live changes. | +| `cache_llm_generated_content` | bool | Cache LLM-generated `query_templates`, `website_keywords`, and `heuristic_keywords`. Set to `false` when iterating schema to see live changes. | ## Required `heuristic_keywords` shape @@ -122,8 +127,7 @@ extraction_system_prompt: |- Prefer explicit values. Use null for qualitative obligations. ``` -See `compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml` -for a complete example. +See `compass/extraction/ghp/plugin_config.yaml` for a complete example. 
## Progressive config path From 1b8571f283056d906ee91ccbb8f933966edd6cc8 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Thu, 19 Mar 2026 17:07:05 -0600 Subject: [PATCH 07/21] renamed skills and fixed minor comments --- .../SKILL.md | 28 ++- .github/skills/extraction-run/SKILL.md | 18 +- .github/skills/iteration-development/SKILL.md | 224 ++++++++++++++++++ .../SKILL.md | 18 +- .github/skills/schema-creation/SKILL.md | 5 +- 5 files changed, 266 insertions(+), 27 deletions(-) rename .github/skills/{web-scraper => document-retrieval}/SKILL.md (81%) create mode 100644 .github/skills/iteration-development/SKILL.md rename .github/skills/{yaml-setup => plugin-config-setup}/SKILL.md (91%) diff --git a/.github/skills/web-scraper/SKILL.md b/.github/skills/document-retrieval/SKILL.md similarity index 81% rename from .github/skills/web-scraper/SKILL.md rename to .github/skills/document-retrieval/SKILL.md index 05a078f0..9c077424 100644 --- a/.github/skills/web-scraper/SKILL.md +++ b/.github/skills/document-retrieval/SKILL.md @@ -1,5 +1,5 @@ --- -name: web-scraper +name: document-retrieval description: Build and tune retrieval configs that search, rank, and collect ordinance documents in COMPASS. Use whenever a user asks to improve retrieval precision/recall, tune search queries/keywords, or debug acquisition quality before extraction tuning. --- @@ -87,11 +87,19 @@ Avoid spaces around `=` in `.env` assignments. 5. Start with dynamic search, then switch to deterministic known URLs when search infrastructure is unstable. -When using `heuristic_keywords`, include all required lists: -- `GOOD_TECH_KEYWORDS` -- `GOOD_TECH_PHRASES` -- `GOOD_TECH_ACRONYMS` -- `NOT_TECH_WORDS` +When using `heuristic_keywords`, use these four lists to guide pre-LLM filtering: +- `GOOD_TECH_KEYWORDS` — strong indicators of the target technology + (e.g., facility types, deployment modes). 
Documents matching even a + few keywords are marked as candidates. +- `GOOD_TECH_PHRASES` — multi-word phrases that signal relevant + ordinance content. Keep specific to avoid false positives. +- `GOOD_TECH_ACRONYMS` — industry-standard abbreviations for the + technology. Narrow list; include only widely recognized acronyms. +- `NOT_TECH_WORDS` — pre-heuristic filter that rejects documents + before keyword matching. Use to exclude adjacent technologies and + irrelevant domains (e.g., residential HVAC, unrelated industries). + Runs first; prevents wasted keyword evaluation on clearly-wrong + documents. If any required list is missing or empty, COMPASS raises a plugin configuration error and extraction quality should be treated as failed. @@ -107,6 +115,10 @@ before using live web search. `code of ordinances`, `use table`, and `special use permit`. ## Deterministic smoke-test mode +For this smoke test, at least one of the following documentation sources must be provided: + +- **`known_doc_urls`**: A list of URLs pointing to external documentation that the scraper can access and parse +- **`known_local_docs`**: A collection of local documentation files available in the repository or system Use run-config controls to bypass flaky search while tuning: @@ -148,8 +160,8 @@ When reusing this workflow for any technology: ## Phase gates - **3 jurisdictions**: ensure major source classes are found. -- **10-25 jurisdictions**: verify stability across regions. -- **Full scale**: only once false positive/negative rates stabilize. +- **10 jurisdictions**: verify stability across regions. 
+ ## Guardrails diff --git a/.github/skills/extraction-run/SKILL.md b/.github/skills/extraction-run/SKILL.md index a356eea2..ed41439d 100644 --- a/.github/skills/extraction-run/SKILL.md +++ b/.github/skills/extraction-run/SKILL.md @@ -6,7 +6,7 @@ description: Execute one-shot extraction with COMPASS and iterate quickly with l # Extraction Run Skill **ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction. -For legacy decision-tree extraction (solar, wind, small wind), consult COMPASS +For decision-tree extraction (solar, wind, small wind), consult COMPASS architecture docs. Use this skill to run one-shot extraction in a repeatable, low-risk way, @@ -17,11 +17,11 @@ then iterate quickly until you have stable structured outputs. - Schema exists and plugin config points to it. - You are onboarding a new technology (diesel generator, geothermal, CHP, hydrogen). - You need a reliable smoke-test workflow before scaling. -- You are NOT using legacy decision-tree extraction. +- You are NOT using decision-tree extraction. ## Do not use -- Legacy decision-tree extraction feature engineering. +- Decision-tree extraction feature engineering. - Python parser implementation in `compass/extraction//parse.py`. - Non-extraction tasks (for example docs-only updates). @@ -47,7 +47,7 @@ them for the same technology: | Mode | Where code lives | Good for | |---|---|---| | **One-shot (schema-based)** | `examples/` → `compass/extraction//` | New techs, no Python changes | -| **Legacy decision-tree** | Python code in `compass/extraction//` | Existing solar, wind, small wind | +| **decision-tree** | Python code in `compass/extraction//` | Existing solar, wind, small wind | One-shot is the correct path for all new technology onboarding. It requires only a schema JSON, a plugin YAML, and a run config — no Python source changes. @@ -61,7 +61,6 @@ New technology assets start in `examples/` and finish in `compass/extraction/`: 3. 
**Promote** — copy the three finalized files into `compass/extraction//`: - `_schema.json` - `_plugin_config.yaml` - - `_config.json5` (optional; useful as a reference run config) - `__init__.py` — registers the plugin via `create_schema_based_one_shot_extraction_plugin` After creating the package, add an import in `compass/extraction/__init__.py` @@ -77,7 +76,7 @@ New technology assets start in `examples/` and finish in `compass/extraction/`: ## Preflight checks (must pass before run) -- Jurisdiction CSV has headers `County,State`. +- Jurisdiction CSV has headers `County,State` or `County,State,Subdivision,Jurisdiction Type`. - `out_dir` is unique for this run. - At least one acquisition step is enabled: `perform_se_search: true`, `perform_website_search: true`, @@ -89,7 +88,6 @@ New technology assets start in `examples/` and finish in `compass/extraction/`: Use tech-first names for all one-shot assets: -- `_config*.json5` - `_plugin_config.yaml` - `_schema.json` - `_jurisdictions*.csv` @@ -149,8 +147,6 @@ pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v - Goal: verify wiring and output contract. 2. **Robustness (5 jurisdictions)** - Goal: verify feature stability and edge-case handling. -3. **Scale (full set)** - - Goal: only after earlier phases pass acceptance gates. ## Validation checklist @@ -161,10 +157,6 @@ Evaluate each run on: - section/summary traceability, - unit consistency, - null discipline, -- **scope bleed** — check that no features appear in the output CSVs that - fall outside the schema enum; generic land-use-code documents can cause - unrelated provisions to leak through. Tighten `extraction_system_prompt` - in plugin YAML to fix this. 
## Expected output artifacts diff --git a/.github/skills/iteration-development/SKILL.md b/.github/skills/iteration-development/SKILL.md new file mode 100644 index 00000000..120bbaef --- /dev/null +++ b/.github/skills/iteration-development/SKILL.md @@ -0,0 +1,224 @@ +--- +name: iteration-development +description: Run → inspect → fix cycle for one-shot extraction after initial setup. Use whenever a user asks to diagnose poor output, reduce scope bleed, improve precision/recall, or scale from smoke tests. +--- + +# Iteration Development Skill + +Use this skill after you have a working schema, plugin YAML, and run config +and want to improve extraction quality through systematic iteration. + +## When to use + +- First smoke run produced output that needs diagnosis or improvement. +- Feature values or units are wrong, missing, or inconsistent. +- Retrieval is returning off-target documents. +- Scaling from 3 jurisdictions to 10–25 or full production. + +## Do not use + +- First-time setup before any successful smoke run. +- Legacy decision-tree extraction development. + +## Expected assistant output + +When using this skill, return: + +1. The observed failure class (retrieval, extraction scope, value/units, or null handling). +2. One concrete fix on a single axis. +3. The re-run command and pass/fail gate check. + +## Canonical reference + +- `examples/one_shot_schema_extraction/` — working examples + to use as a baseline for comparing output quality. + + +## The run → inspect → fix loop + +**Three Phases:** This skill guides you through three phases, all built into +example plugin configurations in the `examples/` directory. + +Repeat this cycle once per iteration. Change exactly one axis per cycle. + +``` +Run → Inspect outputs → Identify failure → Fix one axis → Re-run same sample +``` + +**Never change multiple axes in the same iteration.** You will not know +which change caused the result. 
+ +**Phases encoded in plugin YAML comments:** + +- **Phase 1 (Initial):** Includes query templates, website keywords, and + basic heuristic filters to avoid obvious off-domain results. + **This is ready to run immediately.** +- **Phase 2 (Optional Refinement):** Uncomment advanced heuristic tuning + if Phase 1 retrieval produces off-target documents. +- **Phase 3 (Optional Refinement):** Uncomment extraction_system_prompt + if Phase 1-2 retrieval works but extracted features are wrong (scope bleed). + +Start with Phase 1. Only add Phase 2 / 3 if Phase 1 results need improvement. +See README.rst for the progression path. + + +## Step 1: Inspect output artifacts + +After each run, check these locations inside `out_dir`: + +| Artifact | What to look for | +|---|---| +| `ordinance_files/*.pdf` | Are these on-target documents? | +| `cleaned_text/*.txt` | Does page text contain target technology language? | +| `jurisdiction_dbs/*.csv` | Are feature rows present? Are values correct? | +| `quantitative_ordinances.csv` and `qualitative_ordinances.csv` | Final compiled output — check feature coverage and null rate | +| `logs//*.log` | Error messages, 0-document warnings | + +Minimum passing state for a smoke run: +- At least one `ordinance_files/` PDF per jurisdiction. +- At least one `cleaned_text/` file per jurisdiction. +- Compiled ordinance CSV outputs contain rows for most jurisdictions. + +Immediate fail conditions (fix before any tuning): +- Jurisdiction CSV header mismatch (must include at least `County,State`). +- Plugin configuration exceptions in logs (for example missing required + `heuristic_keywords` lists). +- `Number of jurisdictions with extracted data: 0`. + + +## Step 2: Classify the failure + +Use this decision tree for any defect: + +``` +Is the right document being retrieved? + └─ No → retrieval problem → fix query templates / heuristic_keywords + └─ Yes + Is the document text present in cleaned_text/? 
+       └─ No → text extraction problem → check PDF quality / OCR
+       └─ Yes
+          Are the right features being extracted?
+            └─ No, wrong feature names → schema enum or description problem
+            └─ No, off-domain features → scope bleed → fix extraction_system_prompt
+            └─ Yes, but wrong values/units → schema description or units problem
+            └─ Yes, but nulls where values should be → schema IGNORE clause too broad
+```
+
+
+## Step 3: Fix the right axis
+
+### Retrieval problems (wrong or missing documents)
+
+Fix in plugin YAML:
+- Add more specific `query_templates` with legal code terms
+  (e.g., `"filetype:pdf {jurisdiction} generator zoning code"`).
+- Add target technology terms to `GOOD_TECH_KEYWORDS` and
+  `GOOD_TECH_PHRASES`.
+- Add adjacent-technology terms being confounded to `NOT_TECH_WORDS`.
+- Increase `website_keywords` score for the most discriminating terms.
+
+Required `heuristic_keywords` keys when present:
+- `GOOD_TECH_KEYWORDS`
+- `GOOD_TECH_PHRASES`
+- `GOOD_TECH_ACRONYMS`
+- `NOT_TECH_WORDS`
+
+### Scope bleed (off-domain features extracted)
+
+Fix in plugin YAML `extraction_system_prompt`:
+- State explicitly what is excluded (e.g., "Do not extract requirements for
+  residential portable generators").
+- Add the same language to `$instructions.scope` in the schema for
+  reinforcement.
+
+### Wrong values or units
+
+Fix in schema JSON, in the affected feature's `description`:
+- Add or sharpen the `VALUE` rule.
+- Expand the `UNITS` vocabulary list.
+- Add an `IGNORE` clause for the near-miss case.
+
+### Missing values (nulls where data exists)
+
+Fix in schema JSON:
+- Broaden the feature description to cover the phrasing used in source
+  documents.
+- Remove overly restrictive IGNORE clauses.
+- Check that the feature ID is spelled exactly as it appears in the enum.
+
+### Text extraction failures (blank cleaned_text)
+
+- Verify the PDF is readable (not scanned without OCR).
+- Add `from_ocr: true` to the doc entry in `known_local_docs`. 
+- Set `pytesseract_exe_fp` in run config if OCR is needed. + + +## Iteration hygiene + +- Use a **unique `out_dir`** per iteration run. COMPASS aborts early if the + output directory already contains results. +- Keep the same small jurisdiction sample across all iterations until + quality gates pass. +- Record what changed and why in a short comment in the config file or + a separate `CHANGELOG.md` in the example folder. +- Save schema versions as `_schema_v2.json` etc. to + preserve a diff history. Point `schema:` in plugin YAML to the active + version. + + +## Scale-up protocol + +Only advance to the next phase when the current phase passes all gates. + +| Phase | Jurisdictions | Gates | +|---|---|---| +| Smoke | 1–3 | Output rows exist; feature names match schema enum; section/summary present for most rows | +| Robustness | 10–25 | Feature value types are stable; null rate is explainable; no scope bleed | +| Production | Full national set | False positive/negative rates acceptable; repeated runs show minimal drift | + +When advancing, keep the same config files. Only change the jurisdictions CSV. 
+ + +## Diagnostic commands + +```bash +# Check if cleaned text was produced +ls outputs/*/cleaned_text/ + +# Count output rows per jurisdiction +wc -l outputs/*/jurisdiction_dbs/*.csv + +# Check for scope bleed — feature values that are off-domain +grep -v "diesel\|generator\|backup\|emergency" outputs/ordinances.csv | head -20 + +# View logs for a specific jurisdiction +cat outputs/logs/San\ Diego*/run.log | grep -i "error\|warning\|found 0" +``` + + +## Common failure modes + +| Symptom | Most likely cause | Fix axis | +|---|---|---| +| 0 documents for all jurisdictions | Credentials not loaded / search API down | Load `.env`; use `known_doc_urls` | +| Downloaded PDFs are from wrong domain | `query_templates` too generic | Narrow queries with `filetype:pdf` and legal code terms | +| `cleaned_text` present but no output CSV rows | Schema enum mismatch or extraction prompt failing | Check schema path in plugin YAML; verify `tech` value in run config | +| Off-domain feature names in output | Scope bleed from large land-use code | Add exclusion language to `extraction_system_prompt` | +| Correct features but wrong values | Feature description lacks VALUE rule | Add explicit VALUE rule to affected descriptions | +| Setback in wrong units | UNITS rule missing or implicit | Add explicit UNITS vocabulary to description | +| Null rows for features that are in the document | IGNORE clause too broad, or feature description doesn't match source phrasing | Broaden description; remove over-strict IGNORE clause | +| Playwright timeout errors in logs | Website crawl phase browser failure | Non-fatal; COMPASS continues. Use `known_doc_urls` while iterating | + + +## Acceptance criteria before promotion + +A technology is ready to promote from `examples/` to +`compass/extraction//` when all of the following are true on the +robustness run (10–25 jurisdictions): + +- [ ] Output CSV rows conform to required schema contract. 
+- [ ] Feature IDs are stable and match the schema enum exactly. +- [ ] Most non-null rows include a useful `section` and `summary`. +- [ ] Repeated runs on the same sample show minimal drift. +- [ ] No scope bleed (off-domain features) is observed. +- [ ] Null rate for common features is explainable (jurisdiction has no rule). diff --git a/.github/skills/yaml-setup/SKILL.md b/.github/skills/plugin-config-setup/SKILL.md similarity index 91% rename from .github/skills/yaml-setup/SKILL.md rename to .github/skills/plugin-config-setup/SKILL.md index 1502085c..57ec663d 100644 --- a/.github/skills/yaml-setup/SKILL.md +++ b/.github/skills/plugin-config-setup/SKILL.md @@ -1,5 +1,5 @@ --- -name: yaml-setup +name: plugin-config-setup description: Author and tune one-shot plugin YAML for COMPASS document discovery, filtering, and text collection. Use whenever a user asks to create, clean up, standardize, or troubleshoot one-shot plugin YAML for technology onboarding. --- @@ -87,6 +87,20 @@ schema: ./my_schema.json ## Required `heuristic_keywords` shape +When using `heuristic_keywords`, use these four lists to guide pre-LLM filtering: +- `GOOD_TECH_KEYWORDS` — strong indicators of the target technology + (e.g., facility types, deployment modes). Documents matching even a + few keywords are marked as candidates. +- `GOOD_TECH_PHRASES` — multi-word phrases that signal relevant + ordinance content. Keep specific to avoid false positives. +- `GOOD_TECH_ACRONYMS` — industry-standard abbreviations for the + technology. Narrow list; include only widely recognized acronyms. +- `NOT_TECH_WORDS` — pre-heuristic filter that rejects documents + before keyword matching. Use to exclude adjacent technologies and + irrelevant domains (e.g., residential HVAC, unrelated industries). + Runs first; prevents wasted keyword evaluation on clearly-wrong + documents. 
+ Use this exact structure when defining `heuristic_keywords`: ```yaml @@ -215,8 +229,6 @@ values: } ``` -Do not hardcode model names in skills. Prompt the user to supply `name`. - ## Acquisition phases COMPASS acquisition runs in two sequential phases per jurisdiction: diff --git a/.github/skills/schema-creation/SKILL.md b/.github/skills/schema-creation/SKILL.md index c4941bc1..08981132 100644 --- a/.github/skills/schema-creation/SKILL.md +++ b/.github/skills/schema-creation/SKILL.md @@ -119,7 +119,7 @@ Organize `$definitions` by these families: | Physical design | `screening requirement`, `enclosure requirement`, `exhaust stack height` | | Zoning | `primary use districts`, `conditional use districts`, `prohibited use districts` | | Permitting | `permit requirement`, `capacity threshold` | -| Compliance | `decommissioning`, `enactment date` | +| Compliance | `decommissioning` | ## `$instructions` block @@ -129,7 +129,6 @@ Always include a `$instructions` object at the top level with these keys: "$instructions": { "scope": "Describe exactly what to extract and what to ignore.", "null_handling": "Output every enum feature. Use null value and null summary when a feature is not found in the document. Do not omit features.", - "one_row_per_feature": "Output exactly one row per feature. If multiple values apply, use the most restrictive and describe variants in summary.", "verbatim_quotes": "In summary fields, prefer verbatim quotes from the source. Enclose in double quotation marks.", "units_discipline": "Do not convert units. Record them exactly as they appear in the document." } @@ -150,7 +149,7 @@ Do not expand the feature enum to absorb scope bleed. Narrow the prompt. ## Cross-technology adaptation checklist -When cloning this schema for a new technology: +When cloning a schema for a new technology: - [ ] Replace all feature IDs with technology-specific names. - [ ] Replace value/units rules in every feature description. 
From 61f9b2e3b9d3f37af260ee8639a70b2b75b74eb3 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Fri, 20 Mar 2026 17:43:59 -0600 Subject: [PATCH 08/21] fix URL space encoding and anchor text scoring in website crawl retrieval - percent-encode raw spaces in crawl4ai PDF source URLs before downstream use - populate link text field from anchor text so ELMLinkScorer can score link labels - add two regression tests covering both fixes --- .../geothermal_config.json5 | 28 ----------- .../geothermal_plugin_config.yaml | 41 +--------------- compass/scripts/download.py | 49 ++++++++++++++++++- compass/web/website_crawl.py | 1 + tests/python/unit/web/test_web_crawl.py | 30 ++++++++++++ 5 files changed, 80 insertions(+), 69 deletions(-) delete mode 100644 compass/extraction/geothermal_electricity/geothermal_config.json5 diff --git a/compass/extraction/geothermal_electricity/geothermal_config.json5 b/compass/extraction/geothermal_electricity/geothermal_config.json5 deleted file mode 100644 index 5bd8037b..00000000 --- a/compass/extraction/geothermal_electricity/geothermal_config.json5 +++ /dev/null @@ -1,28 +0,0 @@ -// geothermal_config_demo.json5 -{ - "out_dir": "./outputs_geothermal_dev_one", - "tech": "geothermal_electricity", - "jurisdiction_fp": "./geothermal_jurisdictions_one.csv", - "search_engines": [ - { - "se_name": "SerpAPIGoogleSearch", - "verify": false - } - ], - "file_loader_kwargs": { - "verify_ssl": false - }, - "model": [ - { - "name": "compassop-gpt-4.1-mini", - "llm_call_kwargs": { - "temperature": 0, - "timeout": 600 - }, - "client_kwargs": { - "api_version": "2025-04-01-preview", - "azure_endpoint": "https://aoai-prod-eastus-compassop-001.openai.azure.com/" - } - } - ] -} diff --git a/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml b/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml index a983158a..9843c20a 100644 --- 
a/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml +++ b/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml @@ -164,43 +164,4 @@ heuristic_keywords: - "battery storage" - "energy storage system" - "hydroelectric" - - "biomass" - -collection_prompts: true - -text_extraction_prompts: true - -extraction_system_prompt: |- - You are a legal scholar extracting structured data from geothermal - electricity ordinances and regulations. - - Be focused and literal: extract only enacted, explicit, in-scope - requirements. - Be thorough and complete: review all relevant sections, including tables, - use tables, footnotes, and lists, so no explicitly stated feature is - missed. - Before finalizing, perform a feature-coverage check against the schema enum - and ensure each explicitly supported feature is captured at most once. - - Follow all schema instructions exactly. - Extract only enacted requirements that apply to utility-scale geothermal - electricity generation, geothermal exploration or drilling, geothermal - wells, geothermal power plants, or directly associated generation - facilities such as substations or gen-tie lines when the ordinance - explicitly governs them. - - Do not extract rules that apply only to geothermal heat pumps, HVAC, - direct-use geothermal, district heating, residential systems, or other - non-generation technologies. - - State-level statutes or regulations are valid when they govern local or - project-level siting, permitting, drilling, setbacks, environmental - controls, decommissioning, or operating requirements. - - Prefer explicit enacted values over interpretation. - Use null values and null units for qualitative obligations when the - summary carries the operative legal requirement. - Treat fracking or hydraulic-fracturing language as in scope only when the - ordinance explicitly uses it to regulate or prohibit geothermal - electricity development. 
- Keep summaries source-faithful and include important conditions. + - "biomass" \ No newline at end of file diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 556bebc2..9199eed8 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -2,6 +2,14 @@ import logging from contextlib import AsyncExitStack +from urllib.parse import ( + parse_qsl, + quote, + unquote, + urlencode, + urlparse, + urlunparse, +) from elm.web.document import PDFDocument from elm.web.search.run import ( @@ -362,12 +370,20 @@ async def _crawl_hook(*__, **___): # noqa: RUF029 ch = None async with crawl_semaphore, cpb: - return await crawler.run( + docs_or_pair = await crawler.run( website, on_result_hook=ch, return_c4ai_results=return_c4ai_results, ) + if return_c4ai_results: + docs, c4ai_results = docs_or_pair + _sanitize_doc_sources(docs) + return docs, c4ai_results + + _sanitize_doc_sources(docs_or_pair) + return docs_or_pair + async def download_jurisdiction_ordinances_from_website_compass_crawl( website, @@ -788,6 +804,37 @@ async def _contains_relevant_text( return found_text +def _sanitize_url(url): + """Percent-encode spaces and unsafe characters in a URL path""" + parsed = urlparse(url) + safe_path = quote(unquote(parsed.path), safe="/") + query_params = parse_qsl(parsed.query, keep_blank_values=True) + safe_query = urlencode(query_params, doseq=True) # cspell: disable-line + return urlunparse(( + parsed.scheme, + parsed.netloc, + safe_path, + parsed.params, + safe_query, + parsed.fragment, + )) + + +def _sanitize_doc_sources(docs): + """Rewrite source attrs on documents returned by ELMWebsiteCrawler + + crawl4ai can surface PDF URLs containing raw spaces (e.g. filenames + like "Land Use Code.pdf"). These fail when the file loader issues + an HTTP request because spaces are invalid in a URL path. This + function percent-encodes each document's ``source`` attribute + in-place so that all downstream consumers receive a valid URL. 
+ """ + for doc in docs: + source = doc.attrs.get("source") + if source and " " in source: + doc.attrs["source"] = _sanitize_url(source) + + def _sort_final_ord_docs(all_ord_docs): """Sort ordinance documents by desirability heuristics""" if not all_ord_docs: diff --git a/compass/web/website_crawl.py b/compass/web/website_crawl.py index 63df168c..a5fd2093 100644 --- a/compass/web/website_crawl.py +++ b/compass/web/website_crawl.py @@ -532,6 +532,7 @@ def _extract_links_from_html(text, base_url): return { _Link( title=title, + text=title, href=_sanitize_url(urljoin(base_url, path)), base_domain=base_url, ) diff --git a/tests/python/unit/web/test_web_crawl.py b/tests/python/unit/web/test_web_crawl.py index 4e99a68f..24ae0e61 100644 --- a/tests/python/unit/web/test_web_crawl.py +++ b/tests/python/unit/web/test_web_crawl.py @@ -260,6 +260,36 @@ def test_extract_links_from_html_filters_blacklist(): assert "https://example.com/ok.pdf" in test_refs +def test_extract_links_from_html_sets_text_from_anchor(): + """Anchor text should populate both link title and text""" + + html = """ + Permit Standards + """ + links = _extract_links_from_html(html, base_url="https://example.com") + link = next(iter(links)) + assert link.title == "Permit Standards" + assert link.text == "Permit Standards" + + +@pytest.mark.asyncio +async def test_compass_link_scorer_scores_anchor_text(): + """COMPASSLinkScorer must score anchor text via the 'text' key""" + + scorer = COMPASSLinkScorer(keyword_points={"permit": 10, "pdf": 3}) + links = [ + { + "text": "Permit Standards", + "href": "https://example.com/doc.pdf", + "title": "Permit Standards", + }, + {"text": "", "href": "https://example.com/index.html", "title": ""}, + ] + scored = await scorer.score(links) + assert scored[0]["score"] == 13 + assert scored[1]["score"] == 0 + + def test_debug_info_on_links_logs_expected( compass_logger, assert_message_was_logged ): From bac918ca079e35d074df507e7fab2812e36ae26f Mon Sep 17 00:00:00 2001 From: 
Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Fri, 20 Mar 2026 17:44:25 -0600 Subject: [PATCH 09/21] remove testing skills --- .github/skills/document-retrieval/SKILL.md | 173 ----------- .github/skills/extraction-run/SKILL.md | 274 ----------------- .github/skills/iteration-development/SKILL.md | 224 -------------- .github/skills/plugin-config-setup/SKILL.md | 277 ------------------ .github/skills/schema-creation/SKILL.md | 178 ----------- 5 files changed, 1126 deletions(-) delete mode 100644 .github/skills/document-retrieval/SKILL.md delete mode 100644 .github/skills/extraction-run/SKILL.md delete mode 100644 .github/skills/iteration-development/SKILL.md delete mode 100644 .github/skills/plugin-config-setup/SKILL.md delete mode 100644 .github/skills/schema-creation/SKILL.md diff --git a/.github/skills/document-retrieval/SKILL.md b/.github/skills/document-retrieval/SKILL.md deleted file mode 100644 index 9c077424..00000000 --- a/.github/skills/document-retrieval/SKILL.md +++ /dev/null @@ -1,173 +0,0 @@ ---- -name: document-retrieval -description: Build and tune retrieval configs that search, rank, and collect ordinance documents in COMPASS. Use whenever a user asks to improve retrieval precision/recall, tune search queries/keywords, or debug acquisition quality before extraction tuning. ---- - -# Web Scraper Skill - -Use this skill to improve retrieval precision/recall before extraction tuning. -Applies to both one-shot (schema-driven) and legacy decision-tree extraction -pipelines. - -## When to use - -- Download step returns noisy sources (one-shot extraction). -- Ordinance recall is weak across jurisdictions (one-shot extraction). -- LLM filtering is compensating for poor search quality. - -## Do not use - -- Schema feature definition or value extraction logic design. -- Post-extraction feature/value debugging when retrieval is already correct. - -## Expected assistant output - -When using this skill, return: - -1. 
The retrieval axis changed (queries, keyword weights, or heuristics). -2. Evidence from artifacts/logs showing why the change was needed. -3. The next run command against the same jurisdiction sample. - -## Canonical reference - -Consult example plugin configurations in `examples/`: -- `examples/one_shot_schema_extraction/plugin_config.yaml` — standard one-shot config -- `examples/water_rights_demo/one-shot/plugin_config.yaml` — multi-document edge cases - -When creating new tech configs, use `_plugin_config.yaml` as a recommended -naming convention (e.g. `geothermal_plugin_config.yaml`). - -## Scope - -- Query-template strategy. -- URL ranking and filtering patterns. -- Heuristic phrase controls before LLM validation. - -## Two retrieval phases - -COMPASS runs two sequential acquisition passes per jurisdiction: - -1. **Search-engine phase** — queries `SerpAPIGoogleSearch` (or configured - engine) using `query_templates`. This phase is the primary source of - ordinance documents. -2. **Website crawl phase** — crawls the jurisdiction's official website, - ranking pages using `website_keywords`. This phase is a secondary pass - and runs only if the search-engine phase did not yield an ordinance - context. - -Key behaviors: -- Playwright browser errors during the website crawl phase are **non-fatal**. - COMPASS logs the error and continues. -- `Found 0 potential documents` at the end of the crawl phase is **expected** - for jurisdictions without relevant online ordinances. -- Disable the crawl phase with `perform_website_search: false` in run config - when you want faster smoke tests or Playwright is unavailable. - -## Key management - -For SerpAPI-backed search, keep `api_key` out of committed config and provide -`SERPAPI_KEY` via environment (for example through `.env` loaded in shell). - -Recommended shell setup: - -```bash -set -a -source .env -set +a -``` - -Avoid spaces around `=` in `.env` assignments. - -## Retrieval design pattern - -1. 
Create 3-7 jurisdiction queries with `{jurisdiction}`. -2. Weight legal document indicators in URL keywords. -3. Apply exclusions for templates/reports/slides. -4. Add focused negative tech terms to reduce false positives. -5. Start with dynamic search, then switch to deterministic known URLs when - search infrastructure is unstable. - -When using `heuristic_keywords`, use these four lists to guide pre-LLM filtering: -- `GOOD_TECH_KEYWORDS` — strong indicators of the target technology - (e.g., facility types, deployment modes). Documents matching even a - few keywords are marked as candidates. -- `GOOD_TECH_PHRASES` — multi-word phrases that signal relevant - ordinance content. Keep specific to avoid false positives. -- `GOOD_TECH_ACRONYMS` — industry-standard abbreviations for the - technology. Narrow list; include only widely recognized acronyms. -- `NOT_TECH_WORDS` — pre-heuristic filter that rejects documents - before keyword matching. Use to exclude adjacent technologies and - irrelevant domains (e.g., residential HVAC, unrelated industries). - Runs first; prevents wasted keyword evaluation on clearly-wrong - documents. - -If any required list is missing or empty, COMPASS raises a plugin -configuration error and extraction quality should be treated as failed. - -For first-pass reliability, test retrieval with deterministic known URLs -before using live web search. - -## Technology-specific retrieval controls (template) - -- Include target-technology facility/deployment terms. -- Exclude adjacent and non-target terms (residential/HVAC/PV/etc as needed). -- Favor jurisdictional legal-code signals like `land use code`, - `code of ordinances`, `use table`, and `special use permit`. 
- -## Deterministic smoke-test mode -For this smoke test, at least one of the following documentation sources must be provided: - -- **`known_doc_urls`**: A list of URLs pointing to external documentation that the scraper can access and parse -- **`known_local_docs`**: A collection of local documentation files available in the repository or system - -Use run-config controls to bypass flaky search while tuning: - -- supply `known_doc_urls` or `known_local_docs`, -- set `perform_se_search: false`, -- set `perform_website_search: false`. - -Then validate: - -- download artifacts exist, -- cleaned text exists, -- ordinance DB rows are non-empty. - -## Tuning loop - -1. Run SE-search phase on small sample. -2. Inspect kept vs discarded PDFs (`ordinance_files/`). -3. Run heuristic filter and review false rejects/accepts (`cleaned_text/`). -4. Check website crawl phase independently if needed (enable, run, inspect logs). -5. Update one axis only: - - query templates (affects SE phase), - - URL weights (affects both phases), - - include/exclude heuristic patterns (pre-LLM filter), - - `NOT_TECH_WORDS` (upstream document rejection). -6. Re-run same sample and compare. - -## Cross-tech onboarding - -When reusing this workflow for any technology: - -- keep legal retrieval tokens (`ordinance`, `zoning`, `code`), -- replace all technology terms in `query_templates`, `website_keywords`, - and `heuristic_keywords`, -- seed `known_doc_urls` with authoritative regulatory documents for smoke - testing, -- avoid copying negatives from previous technologies into the new tech config, -- verify `NOT_TECH_WORDS` excludes adjacent technologies for your domain. - -## Phase gates - -- **3 jurisdictions**: ensure major source classes are found. -- **10 jurisdictions**: verify stability across regions. - - -## Guardrails - -- Keep feature extraction logic out of retrieval config. -- Do not overfit to one county's document style. -- Preserve auditable rationale for each retrieval change. 
-- Keep one canonical retrieval config per active technology. -- Ensure each run uses a unique `out_dir` to avoid COMPASS aborting early. - diff --git a/.github/skills/extraction-run/SKILL.md b/.github/skills/extraction-run/SKILL.md deleted file mode 100644 index ed41439d..00000000 --- a/.github/skills/extraction-run/SKILL.md +++ /dev/null @@ -1,274 +0,0 @@ ---- -name: extraction-run -description: Execute one-shot extraction with COMPASS and iterate quickly with low cost. Use whenever a user asks to run, smoke-test, validate, debug, or scale one-shot schema extraction for any technology. ---- - -# Extraction Run Skill - -**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction. -For decision-tree extraction (solar, wind, small wind), consult COMPASS -architecture docs. - -Use this skill to run one-shot extraction in a repeatable, low-risk way, -then iterate quickly until you have stable structured outputs. - -## When to use - -- Schema exists and plugin config points to it. -- You are onboarding a new technology (diesel generator, geothermal, CHP, hydrogen). -- You need a reliable smoke-test workflow before scaling. -- You are NOT using decision-tree extraction. - -## Do not use - -- Decision-tree extraction feature engineering. -- Python parser implementation in `compass/extraction//parse.py`. -- Non-extraction tasks (for example docs-only updates). - -## Expected assistant output - -When using this skill, return: - -1. The exact `pixi run compass process ...` command used. -2. A pass/fail decision against extraction-quality gates. -3. The smallest next config/schema change and why. - -## Canonical reference - -- `examples/one_shot_schema_extraction/` — complete working examples -- `examples/one_shot_schema_extraction/README.rst` — general one-shot overview -- `examples/water_rights_demo/one-shot/` — multi-doc extraction example - -## Two-pipeline modes - -COMPASS supports two distinct extraction pipelines. 
Choose one and do not mix -them for the same technology: - -| Mode | Where code lives | Good for | -|---|---|---| -| **One-shot (schema-based)** | `examples/` → `compass/extraction//` | New techs, no Python changes | -| **decision-tree** | Python code in `compass/extraction//` | Existing solar, wind, small wind | - -One-shot is the correct path for all new technology onboarding. It requires -only a schema JSON, a plugin YAML, and a run config — no Python source changes. - -## Tech promotion lifecycle - -New technology assets start in `examples/` and finish in `compass/extraction/`: - -1. **Develop** — place all assets in `examples/one_shot_schema_extraction/` -2. **Stabilize** — iterate schema/plugin until smoke and robustness gates pass -3. **Promote** — copy the three finalized files into `compass/extraction//`: - - `_schema.json` - - `_plugin_config.yaml` - - `__init__.py` — registers the plugin via `create_schema_based_one_shot_extraction_plugin` - - After creating the package, add an import in `compass/extraction/__init__.py` - to register the plugin at startup. See `compass/extraction/ghp/__init__.py` - for a reference implementation. - -## Required inputs - -- Run config for `compass process`. -- Plugin config containing `schema`. -- API keys in environment (never hardcode in configs). -- A jurisdiction set sized to the current phase. - -## Preflight checks (must pass before run) - -- Jurisdiction CSV has headers `County,State` or `County,State,Subdivision,Jurisdiction Type`. -- `out_dir` is unique for this run. -- At least one acquisition step is enabled: - `perform_se_search: true`, `perform_website_search: true`, - `known_doc_urls`, or `known_local_docs`. -- If `heuristic_keywords` exists, all four required lists are present and - non-empty. 
- -## Naming convention - -Use tech-first names for all one-shot assets: - -- `_plugin_config.yaml` -- `_schema.json` -- `_jurisdictions*.csv` - -The `tech` value in the run config must be a string that becomes the plugin -registry identifier. It must be unique, lowercase, and underscore-separated -(for example `concentrating_solar`, `geothermal_electricity`). COMPASS will -raise `Unknown tech input` if this key does not match any registered plugin. - -## Canonical development pattern - -For early development, start with the proven dynamic baseline, then fall back -to deterministic mode only when search infrastructure is unstable: - -1. Use one small jurisdiction file (1-3 rows). -2. Use your preferred configured search engine. -3. Load `.env` into shell (`set -a && source .env && set +a`). -4. Run with verbose logs: - - `pixi run compass process -c config.json5 -p plugin.yaml -v` -5. Confirm output artifacts exist before tuning schema semantics. - -Fallback mode when needed: - -- Add `known_doc_urls` (or `known_local_docs`) in run config. -- Set `perform_se_search: false` and `perform_website_search: false`. - -## Adaptation rule - -When adapting this workflow for a new technology, keep the run structure -unchanged and swap only technology-specific inputs: - -- `tech` in run config, -- schema file, -- plugin descriptor (`data_type_short_desc`), -- retrieval query/keyword vocabulary, -- known document URL set. - -Change one axis per run unless debugging infrastructure failures. - -## Environment setup - -Load secrets from `.env` before running. Never commit key values in config files. - -```bash -set -a && source .env && set +a # no spaces around = in .env assignments -``` - -## Core command - -```bash -pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v -``` - -## Phase-gated workflow - -1. **Smoke test (1 jurisdiction)** - - Goal: verify wiring and output contract. -2. 
**Robustness (5 jurisdictions)** - - Goal: verify feature stability and edge-case handling. - -## Validation checklist - -Evaluate each run on: - -- document relevance (exclude off-domain content), -- feature coverage vs expected ordinance topics, -- section/summary traceability, -- unit consistency, -- null discipline, - -## Expected output artifacts - -A successful run produces these files under `out_dir`: - -| Artifact | Meaning | -|---|---| -| `ordinance_files/*.pdf` | Downloaded source documents | -| `cleaned_text/*.txt` | Heuristic-filtered extracted text | -| `jurisdiction_dbs/*.csv` | Per-jurisdiction raw extraction rows | -| `quantitative_ordinances.csv` | Final compiled numeric features | -| `qualitative_ordinances.csv` | Final compiled qualitative features | -| `usage.json` | Per-jurisdiction LLM token and request counts | -| `meta.json` | Run metadata (cost, timing, version) | - -Final CSV columns: `county`, `state`, `subdivision`, `jurisdiction_type`, -`FIPS`, `feature`, `value`, `units`, `adder`, `min_dist`, `max_dist`, -`summary`, `year`, `section`, `source`. - -## Interpreting output status correctly - -`cleaned_text` files can exist while `Number of documents found` is `0`. - -This means acquisition/text collection worked, but no final structured ordinance -rows were emitted into consolidated DB outputs. - -Check in order: - -1. `outputs/*/cleaned_text/*.txt` (text extraction present) -2. `outputs/*/jurisdiction_dbs/*.csv` (per-jurisdiction parsed rows) -3. 
`outputs/*/quantitative_ordinances.csv` and - `outputs/*/qualitative_ordinances.csv` (final compiled results) - -Treat the run as **failed for extraction quality** when either is true: -- `Number of jurisdictions with extracted data: 0` -- any configuration exception appears in logs (even if process exits 0) - -Only treat a run as passing when both are true: -- at least one jurisdiction has extracted data -- at least one jurisdiction CSV in `jurisdiction_dbs/` has more than header row - -## Root-cause triage - -- **Wrong or noisy documents** - - Tune query templates, URL keywords, and exclusions. - - Prefer `known_doc_urls` while stabilizing. -- **Right documents, wrong fields** - - Tune schema descriptions/examples and ambiguity rules. - - Check `extraction_system_prompt` in plugin YAML — it is the primary - guard against scope bleed from generic legal documents. -- **Correct values, unstable formatting** - - Tighten enums, unit vocabulary, and null behavior. -- **Nothing downloaded / unstable search** - - Disable live search and use deterministic known URLs/local docs. -- **0 documents found for a jurisdiction during website crawl** - - Expected for jurisdictions with few online ordinances. The website - crawl is a second acquisition pass after search-engine retrieval; - 0 results there is not a pipeline failure. - -## Acceptance gates - -Do not advance phases until all are true: - -- Output rows conform to required contract. -- High share of rows include useful `section` and `summary`. -- Feature names are stable and machine-consistent. -- Repeated runs on same sample show minimal drift. - -## Cost and speed controls - -- Keep sample size minimal while tuning. -- Change one variable per run. -- Archive run command, input set, and output path for each iteration. 
- -## Workspace hygiene (important) - -Keep one canonical working set per technology in `examples/`: - -- one run config, -- one plugin config, -- one schema, -- one jurisdiction file, -- one known docs file. - -Delete stale `_migrated`, `_smoke`, and duplicate output folders to avoid -configuration drift and debugging confusion. - -## Known infrastructure issues - -### Playwright timeouts - -Web search via `rebrowser_playwright` may fail with 60s timeouts on -`Page.wait_for_selector`. Symptoms: -- `TimeoutError: Page.wait_for_selector: Timeout 60000ms exceeded` -- All search queries fail consistently -- Browser session crashes with `ProtocolError: Internal server error, session closed` - -These errors during the **website crawl phase** (second acquisition pass) are -**non-fatal**. COMPASS logs them and continues. They do not block the -search-engine phase or extraction. - -If search itself is failing, verify provider credentials are loaded and fall -back to deterministic mode. - -**Workaround**: Use `known_local_docs` or `known_doc_urls` and disable -search/website steps while validating extraction logic. - -### known_local_docs loading failures - -`known_local_docs` may fail silently with `ERROR: Failed to read file` in -jurisdiction logs due to external loader behavior. - -**Workaround**: Prefer `known_doc_urls` for deterministic smoke tests and -pre-validate local docs before pipeline runs. - diff --git a/.github/skills/iteration-development/SKILL.md b/.github/skills/iteration-development/SKILL.md deleted file mode 100644 index 120bbaef..00000000 --- a/.github/skills/iteration-development/SKILL.md +++ /dev/null @@ -1,224 +0,0 @@ ---- -name: iteration-development -description: Run → inspect → fix cycle for one-shot extraction after initial setup. Use whenever a user asks to diagnose poor output, reduce scope bleed, improve precision/recall, or scale from smoke tests. 
---- - -# Iteration Development Skill - -Use this skill after you have a working schema, plugin YAML, and run config -and want to improve extraction quality through systematic iteration. - -## When to use - -- First smoke run produced output that needs diagnosis or improvement. -- Feature values or units are wrong, missing, or inconsistent. -- Retrieval is returning off-target documents. -- Scaling from 3 jurisdictions to 10–25 or full production. - -## Do not use - -- First-time setup before any successful smoke run. -- Legacy decision-tree extraction development. - -## Expected assistant output - -When using this skill, return: - -1. The observed failure class (retrieval, extraction scope, value/units, or null handling). -2. One concrete fix on a single axis. -3. The re-run command and pass/fail gate check. - -## Canonical reference - -- `examples/one_shot_schema_extraction/` — working examples - to use as a baseline for comparing output quality. - - -## The run → inspect → fix loop - -**Three Phases:** This skill guides you through three phases, all built into -example plugin configurations in the `examples/` directory. - -Repeat this cycle once per iteration. Change exactly one axis per cycle. - -``` -Run → Inspect outputs → Identify failure → Fix one axis → Re-run same sample -``` - -**Never change multiple axes in the same iteration.** You will not know -which change caused the result. - -**Phases encoded in plugin YAML comments:** - -- **Phase 1 (Initial):** Includes query templates, website keywords, and - basic heuristic filters to avoid obvious off-domain results. - **This is ready to run immediately.** -- **Phase 2 (Optional Refinement):** Uncomment advanced heuristic tuning - if Phase 1 retrieval produces off-target documents. -- **Phase 3 (Optional Refinement):** Uncomment extraction_system_prompt - if Phase 1-2 retrieval works but extracted features are wrong (scope bleed). - -Start with Phase 1. 
Only add Phase 2 / 3 if Phase 1 results need improvement. -See README.rst for the progression path. - - -## Step 1: Inspect output artifacts - -After each run, check these locations inside `out_dir`: - -| Artifact | What to look for | -|---|---| -| `ordinance_files/*.pdf` | Are these on-target documents? | -| `cleaned_text/*.txt` | Does page text contain target technology language? | -| `jurisdiction_dbs/*.csv` | Are feature rows present? Are values correct? | -| `quantitative_ordinances.csv` and `qualitative_ordinances.csv` | Final compiled output — check feature coverage and null rate | -| `logs//*.log` | Error messages, 0-document warnings | - -Minimum passing state for a smoke run: -- At least one `ordinance_files/` PDF per jurisdiction. -- At least one `cleaned_text/` file per jurisdiction. -- Compiled ordinance CSV outputs contain rows for most jurisdictions. - -Immediate fail conditions (fix before any tuning): -- Jurisdiction CSV header mismatch (must include at least `County,State`). -- Plugin configuration exceptions in logs (for example missing required - `heuristic_keywords` lists). -- `Number of jurisdictions with extracted data: 0`. - - -## Step 2: Classify the failure - -Use this decision tree for any defect: - -``` -Is the right document being retrieved? - └─ No → retrieval problem → fix query templates / heuristic_keywords - └─ Yes - Is the document text present in cleaned_text/? - └─ No → text extraction problem → check PDF quality / OCR - └─ Yes - Are the right features being extracted? 
- └─ No, wrong feature names → schema enum or description problem - └─ No, off-domain features → scope bleed → fix extraction_system_prompt - └─ Yes, but wrong values/units → schema description or units problem - └─ Yes, but nulls where values should be → schema IGNORE clause too broad -``` - - -## Step 3: Fix the right axis - -### Retrieval problems (wrong or missing documents) - -Fix in plugin YAML: -- Add more specific `query_templates` with legal code terms - (e.g., `"filetype:pdf {jurisdiction} generator zoning code"`). -- Add target technology terms to `GOOD_TECH_KEYWORDS` and - `GOOD_TECH_PHRASES`. -- Add adjacent-technology terms being confounded to `NOT_TECH_WORDS`. -- Increase `website_keywords` score for the most discriminating terms. - -Required `heuristic_keywords` keys when present: -- `GOOD_TECH_KEYWORDS` -- `GOOD_TECH_PHRASES` -- `GOOD_TECH_ACRONYMS` -- `NOT_TECH_WORDS` - -### Scope bleed (off-domain features extracted) - -Fix in plugin YAML `extraction_system_prompt`: -- State explicitly what is excluded (e.g., "Do not extract requirements for - residential portable generators"). -- Add the same language to `$instructions.scope` in the schema for - reinforcement. - -### Wrong values or units - -Fix in schema JSON, in the affected feature's `description`: -- Add or sharpen the `VALUE` rule. -- Expand the `UNITS` vocabulary list. -- Add a `IGNORE` clause for the near-miss case. - -### Missing values (nulls where data exists) - -Fix in schema JSON: -- Broaden the feature description to cover the phrasing used in source - documents. -- Remove overly restrictive IGNORE clauses. -- Check that the feature ID is spelled exactly as it appears in the enum. - -### Text extraction failures (blank cleaned_text) - -- Verify the PDF is readable (not scanned without OCR). -- Add `from_ocr: true` to the doc entry in `known_local_docs`. -- Set `pytesseract_exe_fp` in run config if OCR is needed. 
- - -## Iteration hygiene - -- Use a **unique `out_dir`** per iteration run. COMPASS aborts early if the - output directory already contains results. -- Keep the same small jurisdiction sample across all iterations until - quality gates pass. -- Record what changed and why in a short comment in the config file or - a separate `CHANGELOG.md` in the example folder. -- Save schema versions as `_schema_v2.json` etc. to - preserve a diff history. Point `schema:` in plugin YAML to the active - version. - - -## Scale-up protocol - -Only advance to the next phase when the current phase passes all gates. - -| Phase | Jurisdictions | Gates | -|---|---|---| -| Smoke | 1–3 | Output rows exist; feature names match schema enum; section/summary present for most rows | -| Robustness | 10–25 | Feature value types are stable; null rate is explainable; no scope bleed | -| Production | Full national set | False positive/negative rates acceptable; repeated runs show minimal drift | - -When advancing, keep the same config files. Only change the jurisdictions CSV. 
- - -## Diagnostic commands - -```bash -# Check if cleaned text was produced -ls outputs/*/cleaned_text/ - -# Count output rows per jurisdiction -wc -l outputs/*/jurisdiction_dbs/*.csv - -# Check for scope bleed — feature values that are off-domain -grep -v "diesel\|generator\|backup\|emergency" outputs/ordinances.csv | head -20 - -# View logs for a specific jurisdiction -cat outputs/logs/San\ Diego*/run.log | grep -i "error\|warning\|found 0" -``` - - -## Common failure modes - -| Symptom | Most likely cause | Fix axis | -|---|---|---| -| 0 documents for all jurisdictions | Credentials not loaded / search API down | Load `.env`; use `known_doc_urls` | -| Downloaded PDFs are from wrong domain | `query_templates` too generic | Narrow queries with `filetype:pdf` and legal code terms | -| `cleaned_text` present but no output CSV rows | Schema enum mismatch or extraction prompt failing | Check schema path in plugin YAML; verify `tech` value in run config | -| Off-domain feature names in output | Scope bleed from large land-use code | Add exclusion language to `extraction_system_prompt` | -| Correct features but wrong values | Feature description lacks VALUE rule | Add explicit VALUE rule to affected descriptions | -| Setback in wrong units | UNITS rule missing or implicit | Add explicit UNITS vocabulary to description | -| Null rows for features that are in the document | IGNORE clause too broad, or feature description doesn't match source phrasing | Broaden description; remove over-strict IGNORE clause | -| Playwright timeout errors in logs | Website crawl phase browser failure | Non-fatal; COMPASS continues. Use `known_doc_urls` while iterating | - - -## Acceptance criteria before promotion - -A technology is ready to promote from `examples/` to -`compass/extraction//` when all of the following are true on the -robustness run (10–25 jurisdictions): - -- [ ] Output CSV rows conform to required schema contract. 
-- [ ] Feature IDs are stable and match the schema enum exactly. -- [ ] Most non-null rows include a useful `section` and `summary`. -- [ ] Repeated runs on the same sample show minimal drift. -- [ ] No scope bleed (off-domain features) is observed. -- [ ] Null rate for common features is explainable (jurisdiction has no rule). diff --git a/.github/skills/plugin-config-setup/SKILL.md b/.github/skills/plugin-config-setup/SKILL.md deleted file mode 100644 index 57ec663d..00000000 --- a/.github/skills/plugin-config-setup/SKILL.md +++ /dev/null @@ -1,277 +0,0 @@ ---- -name: plugin-config-setup -description: Author and tune one-shot plugin YAML for COMPASS document discovery, filtering, and text collection. Use whenever a user asks to create, clean up, standardize, or troubleshoot one-shot plugin YAML for technology onboarding. ---- - -# YAML Setup Skill - -**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction. -For legacy decision-tree extraction, consult COMPASS architecture docs. - -Use this skill to create or tune one-shot plugin YAML that controls retrieval, -filtering, and text collection behavior. - -## When to use - -- New technology onboarding in one-shot extraction (NOT decision-tree extraction). -- Schema exists but source relevance is weak. -- You need reproducible config handoff across teams. - -## Do not use - -- Legacy decision-tree parser implementation changes. -- Schema feature semantics work that belongs in `_schema.json`. -- Run-result diagnosis after outputs are generated (use iteration loop skill). - -## Expected assistant output - -When using this skill, return: - -1. The finalized plugin YAML content or exact diff. -2. Any required paired run-config changes. -3. A validation command and pass/fail checks for the edited YAML. 
- -## Canonical reference - -Consult the working examples in `examples/`: -- `examples/one_shot_schema_extraction/plugin_config.yaml` — standard working example -- `examples/water_rights_demo/one-shot/plugin_config.yaml` — multi-doc edge case - -When creating new tech configs, `_plugin_config.yaml` is the recommended -naming convention (e.g. `geothermal_plugin_config.yaml`). The existing -`plugin_config.yaml` examples use a generic name; new tech-specific assets -should use the tech-first naming pattern. - -Refer to any complete example in `examples/` that matches your retrieval goals. - -## Naming convention - -Use tech-first file names when creating new one-shot assets: -`_config*.json5`, `_plugin_config.yaml`, -`_schema.json`, `_jurisdictions*.csv`. - -## Secret handling - -Keep API keys in environment variables (for example `SERPAPI_KEY`, -`AZURE_OPENAI_API_KEY`) rather than in plugin or run config files. -Load them per shell session with `set -a && source .env && set +a`. -Avoid spaces around `=` in `.env` assignments. - -## Required minimum - -```yaml -schema: ./my_schema.json -``` - -## Non-negotiable runtime constraints - -- Jurisdiction CSV headers are case-sensitive: use `County,State`. -- If `heuristic_keywords` is present, it must include all four lists and - none may be empty. -- A run is not considered passing if logs show config errors or if - extracted jurisdiction count is zero. - -## Key plugin YAML fields - -| Field | Type | Behavior | -|---|---|---| -| `schema` | string (path) | **Required.** Path to JSON schema file, relative to plugin YAML location. | -| `data_type_short_desc` | string | Short description used in LLM prompts (e.g. `utility-scale ordinance`). | -| `query_templates` | list | Search query templates; `{jurisdiction}` is replaced at runtime. | -| `website_keywords` | dict | Keyword → score map for URL ranking during website crawl. | -| `heuristic_keywords` | dict or `true` | Pre-LLM text filter. 
If `true`, LLM generates lists from schema. | -| `collection_prompts` | list or `true` | Text collection prompt(s). If **`true`**, LLM auto-generates from schema. | -| `text_extraction_prompts` | list or `true` | Text consolidation prompt(s). If **`true`**, LLM auto-generates from schema. | -| `extraction_system_prompt` | string | Overrides default LLM system prompt for the extraction step. Use this to scope extraction tightly to the target technology. | -| `cache_llm_generated_content` | bool | Cache LLM-generated `query_templates`, `website_keywords`, and `heuristic_keywords`. Set to `false` when iterating schema to see live changes. | - -## Required `heuristic_keywords` shape - -When using `heuristic_keywords`, use these four lists to guide pre-LLM filtering: -- `GOOD_TECH_KEYWORDS` — strong indicators of the target technology - (e.g., facility types, deployment modes). Documents matching even a - few keywords are marked as candidates. -- `GOOD_TECH_PHRASES` — multi-word phrases that signal relevant - ordinance content. Keep specific to avoid false positives. -- `GOOD_TECH_ACRONYMS` — industry-standard abbreviations for the - technology. Narrow list; include only widely recognized acronyms. -- `NOT_TECH_WORDS` — pre-heuristic filter that rejects documents - before keyword matching. Use to exclude adjacent technologies and - irrelevant domains (e.g., residential HVAC, unrelated industries). - Runs first; prevents wasted keyword evaluation on clearly-wrong - documents. - -Use this exact structure when defining `heuristic_keywords`: - -```yaml -heuristic_keywords: - GOOD_TECH_KEYWORDS: - - "" - GOOD_TECH_PHRASES: - - "" - GOOD_TECH_ACRONYMS: - - "" - NOT_TECH_WORDS: - - "" -``` - -Notes: -- Keys are normalized, but using canonical key names reduces mistakes. -- All four lists are required and must be non-empty. 
- -### `collection_prompts: true` and `text_extraction_prompts: true` - -Setting either flag to `true` (not a list) instructs COMPASS to use the LLM -to auto-generate the prompts from the schema content. This is the recommended -shortcut during development — do not write manual prompt lists until -auto-generated ones prove insufficient. - -### `extraction_system_prompt` - -This is the primary control for preventing scope bleed from generic land-use -code documents. Write it as a multi-line YAML literal block: - -```yaml -extraction_system_prompt: |- - You are a legal scholar extracting structured data from - utility-scale ordinances. - - Extract only enacted requirements for utility-scale facilities. - Exclude adjacent technologies and non-target use cases. - Prefer explicit values. Use null for qualitative obligations. -``` - -See `compass/extraction/ghp/plugin_config.yaml` for a complete example. - -## Progressive config path - -1. **Minimal** - - Confirm schema path and extraction invocation work. -2. **Simple** - - Add `query_templates`, `heuristic_keywords`, and `cache_llm_generated_content`. -3. **Full** - - Add `extraction_system_prompt` if scope bleed or off-domain extraction - is observed. - - Set `collection_prompts: true` and `text_extraction_prompts: true` to - let the LLM auto-generate prompts from the schema. - - Replace `heuristic_keywords: true` with an explicit list if precision - is insufficient. - -Use the same progression for any technology. 
- -## Baseline YAML pattern - -```yaml -schema: ./my_schema.json -data_type_short_desc: utility-scale ordinance -cache_llm_generated_content: true -query_templates: - - "filetype:pdf {jurisdiction} ordinance" - - "{jurisdiction} zoning ordinance" - - "{jurisdiction} permitting requirements" -website_keywords: - pdf: 92160 - : 46080 - ordinance: 23040 - zoning: 2880 - permit: 1440 -heuristic_keywords: - GOOD_TECH_KEYWORDS: - - "" - - "" - GOOD_TECH_ACRONYMS: - - "" - GOOD_TECH_PHRASES: - - "" - - "" - NOT_TECH_WORDS: - - "" - - "" -``` - -Swap vocabulary for any technology while keeping the same structure. - -## Stable development mode - -Use run-config controls for deterministic smoke tests while iterating schema: - -- `known_doc_urls` or `known_local_docs` — bypass live search -- `perform_se_search: false` — disable search-engine phase -- `perform_website_search: false` — disable website crawl phase - -Re-enable search only after extraction quality is stable on known documents. - -Recommended baseline: use dynamic search first, then use deterministic mode -if search infrastructure fails. - -## Minimal run-config contract (to pair with plugin YAML) - -Use this pattern and require users to provide their own model and client -values: - -```json5 -{ - out_dir: "./outputs__", - tech: "", - jurisdiction_fp: "./_jurisdictions.csv", - perform_se_search: true, - perform_website_search: false, - model: [ - { - name: "", - llm_call_kwargs: { temperature: 0, timeout: 600 }, - client_kwargs: { - api_version: "", - azure_endpoint: "" - } - } - ] -} -``` - -## Acquisition phases - -COMPASS acquisition runs in two sequential phases per jurisdiction: - -1. **Search-engine phase** — uses `SerpAPIGoogleSearch` or similar; driven by - `query_templates`. -2. **Website crawl phase** — crawls the jurisdiction's main website using - `website_keywords` for ranking. Playwright browser errors during this - phase are **non-fatal**; COMPASS logs them and moves on. 
- -`perform_website_search: false` skips phase 2. Use it during smoke tests to -keep run time short and avoid Playwright dependency issues. - -## Validation checklist - -- Schema path resolves from runtime working directory. -- Query templates include `{jurisdiction}` consistently. -- URL weights favor legal and government documents. -- Heuristic exclusions are precise and not over-broad. -- Prompt overrides are only added when default behavior fails. - -## Cross-tech adaptation checklist - -When adapting to another technology: - -- replace vocabulary in `query_templates` and `website_keywords`, -- keep legal-code terms (`ordinance`, `zoning`, `code of ordinances`), -- keep non-target exclusions explicit in `NOT_TECH_WORDS`, -- do not carry terms from a previous technology into new tech configs, -- write a technology-specific `extraction_system_prompt`. - -## Run command - -```bash -pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v -``` - -If running outside the tech folder, use absolute paths for `-c` and `-p`. - -## Guardrails - -- Retrieval behavior belongs in plugin YAML. -- Feature logic belongs in schema. -- Adjust one tuning axis per run for clean attribution. -- Keep one canonical plugin file per technology in the active example folder. - diff --git a/.github/skills/schema-creation/SKILL.md b/.github/skills/schema-creation/SKILL.md deleted file mode 100644 index 08981132..00000000 --- a/.github/skills/schema-creation/SKILL.md +++ /dev/null @@ -1,178 +0,0 @@ ---- -name: schema-creation -description: Author and iterate one-shot extraction schemas for native COMPASS. Use whenever a user asks to create, expand, or debug schema feature definitions, value/unit rules, or extraction instructions. ---- - -# Schema Creation Skill - -**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction -(new technology onboarding with JSON schema + plugin YAML). 
For legacy decision-tree -extraction (existing solar/wind/small-wind in `compass/extraction//`), -consult COMPASS architecture docs. - -Use this skill to define what the LLM extracts and how it formats results. -The schema is the single most important config file for output quality. - -## When to use - -- Starting a new one-shot technology extraction (NOT decision-tree legacy extraction). -- Fixing inconsistent or incorrect extracted values in one-shot extraction. -- Adding new features to an existing one-shot extraction. - -## Do not use - -- Retrieval tuning tasks that belong in plugin YAML. -- Legacy decision-tree extraction parser implementation. - -## Expected assistant output - -When using this skill, return: - -1. The proposed schema diff (or full schema block) for the targeted features. -2. The rationale for VALUE, UNITS, and IGNORE wording. -3. A smoke-test check plan for validating the schema change. - -## Canonical reference - -For complete examples, see the `examples/` directory: -- `examples/one_shot_schema_extraction/wind_schema.json` -- `examples/water_rights_demo/one-shot/water_rights_schema.json5` - -Each follows the pattern: `_schema.json` or `_schema.json5`. - -## Required output contract - -Every schema must define `outputs` as an array. 
Each item must require -exactly these five fields and set `additionalProperties: false`: - -```json -{ - "type": "object", - "required": ["outputs"], - "additionalProperties": false, - "properties": { - "outputs": { - "type": "array", - "items": { - "type": "object", - "required": ["feature", "value", "units", "section", "summary"], - "additionalProperties": false, - "properties": { - "feature": { "type": "string", "enum": ["..."] }, - "value": { "anyOf": [{"type": "number"}, {"type": "string"}, {"type": "boolean"}, {"type": "array", "items": {"type": "string"}}, {"type": "null"}] }, - "units": { "type": ["string", "null"] }, - "section": { "type": ["string", "null"] }, - "summary": { "type": ["string", "null"] } - } - } - } - } -} -``` - -These five fields map directly to the output CSV columns. COMPASS adds -`county`, `state`, `FIPS`, and other metadata columns automatically. - -## Build sequence - -1. **Define the feature enum** — one stable lowercase ID per siting-relevant - requirement. Keep naming consistent across iterations and group IDs by - family (setbacks, noise, zoning, permitting). -2. **Define `value` and `units` rules per feature family** — in each - feature's `description`, state the expected value type and accepted unit - vocabulary explicitly. -3. **Add `$definitions`** — group related feature descriptions here to keep - the `feature` enum block clean. -4. **Add `$instructions`** — encode global extraction policy (scope, null - handling, one-row-per-feature contract, verbatim quote preference). -5. **Smoke-test on one jurisdiction** — validate all enum items appear in - output and null rows are correctly populated for missing features. - -## Feature definition template - -Every feature description must answer four questions: - -1. **What is this?** One sentence identifying the regulatory concept. -2. **VALUE rule:** What type is the value and what specific values/ranges are - valid? -3. 
**UNITS rule:** What unit string is accepted, or `null` if not applicable? -4. **IGNORE / CLARIFICATION:** What near-miss concepts must NOT match this - feature? - -Example (abbreviated): - -```json -"structure setback": { - "description": "Minimum distance from the generator to an occupied building. VALUE: numerical distance. UNITS: 'feet' or 'meters'. IGNORE: setbacks from property lines or roads — those are separate features." -} -``` - -## Feature family taxonomy - -Organize `$definitions` by these families: - -| Family | Example features | -|---|---| -| Setbacks | `structure setback`, `property line setback`, `road setback` | -| Noise/Emissions | `noise limit`, `emissions standard`, `vibration limit` | -| Operational | `hours of operation` | -| Physical design | `screening requirement`, `enclosure requirement`, `exhaust stack height` | -| Zoning | `primary use districts`, `conditional use districts`, `prohibited use districts` | -| Permitting | `permit requirement`, `capacity threshold` | -| Compliance | `decommissioning` | - -## `$instructions` block - -Always include a `$instructions` object at the top level with these keys: - -```json -"$instructions": { - "scope": "Describe exactly what to extract and what to ignore.", - "null_handling": "Output every enum feature. Use null value and null summary when a feature is not found in the document. Do not omit features.", - "verbatim_quotes": "In summary fields, prefer verbatim quotes from the source. Enclose in double quotation marks.", - "units_discipline": "Do not convert units. Record them exactly as they appear in the document." -} -``` - -## Scope bleed control - -When COMPASS retrieves a large land-use code instead of a tech-specific -ordinance, the LLM may extract off-domain provisions. - -Fix order (most powerful first): -1. `extraction_system_prompt` in plugin YAML — state explicitly what is in - scope and what is excluded. -2. `$instructions.scope` in schema — reinforce with exclusion language. -3. 
`heuristic_keywords.NOT_TECH_WORDS` — reject documents upstream. - -Do not expand the feature enum to absorb scope bleed. Narrow the prompt. - -## Cross-technology adaptation checklist - -When cloning a schema for a new technology: - -- [ ] Replace all feature IDs with technology-specific names. -- [ ] Replace value/units rules in every feature description. -- [ ] Replace exclusion terms in `$instructions.scope` and feature IGNORE - clauses. -- [ ] Replace `$definitions` group names to match new feature families. -- [ ] Smoke-test before widening to 10+ jurisdictions. - -## Quality checklist - -- [ ] Feature enum uses stable, consistent IDs across all runs. -- [ ] Every feature description contains VALUE, UNITS, and IGNORE clauses. -- [ ] `$instructions` block is present with all five keys. -- [ ] `additionalProperties: false` is set on the top-level object and on - each item in the `outputs` array. -- [ ] Schema validates cleanly against a JSON Schema validator. -- [ ] A smoke run using this schema produces extracted rows (not just - successful process exit logs). - -## Anti-patterns to avoid - -- Feature IDs that change names between iterations. -- Implicit unit assumptions not stated in description text. -- Missing IGNORE clauses for common near-miss features. -- Examples in descriptions that contradict field rules. -- Widening the enum to absorb scope bleed instead of tightening the prompt. 
From dab97ff8b202357a95279e4b1e06f19f0dba1b73 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:06:32 -0600 Subject: [PATCH 10/21] fixed schema --- .../geothermal_schema.json | 35 +++++-------------- 1 file changed, 8 insertions(+), 27 deletions(-) diff --git a/compass/extraction/geothermal_electricity/geothermal_schema.json b/compass/extraction/geothermal_electricity/geothermal_schema.json index 6de8d44f..46c658ab 100644 --- a/compass/extraction/geothermal_electricity/geothermal_schema.json +++ b/compass/extraction/geothermal_electricity/geothermal_schema.json @@ -23,9 +23,9 @@ "properties": { "feature": { "type": "string", - "description": "The ordinance feature being extracted. Must be one of the enumerated feature IDs.", + "description": "The ordinance feature being extracted. Must be one of the enumerated feature IDs exactly as written. Do not invent aliases, prefixes, or synonym variants; for example, use 'residential zones setback' and never 'structures residential zones setback'.", "enum": [ - "structures residential zones setback", + "residential zones setback", "property lines setback", "roads setback", "railroads setback", @@ -52,8 +52,7 @@ "permit requirement", "bond requirement", "decommissioning", - "prohibitions", - "ordinance date" + "prohibitions" ] }, "value": { @@ -78,7 +77,7 @@ }, "units": { "type": ["string", "null"], - "description": "Units for the extracted value. Preserve ordinance wording when possible. For setbacks and height, use linear units such as 'feet' or 'meters' as stated. For minimum lot size, use area units such as 'acres' or 'square feet'. For noise, use 'dBA' only if the ordinance says 'dBA' or 'dB(A)'; if it says 'dB' without A-weighting, keep 'dB'. Use null for district lists, permit lists, dates, and qualitative requirements without measurable units. 
For drilling-hours strings, use null unless the ordinance states a measurable hour cap rather than an operating window." + "description": "Units for the extracted value. Use canonical units for interoperability. For setbacks and height, use 'feet' or 'meters'. For minimum lot size, use 'acres' or 'square feet'. For noise, normalize A-weighted variants such as 'dB(A)' or 'dBA' to 'dBA'; keep plain 'dB' only when the text is explicitly not A-weighted. Preserve verbatim ordinance wording in summary while keeping units standardized in this field. Use null for district lists, permit lists, dates, and qualitative requirements without measurable units. For drilling-hours strings, use null unless the ordinance states a measurable hour cap rather than an operating window." }, "section": { "type": ["string", "null"], @@ -123,7 +122,7 @@ "setback_features": { "description": "Setback features for geothermal electricity facilities and related infrastructure. Treat each setback feature independently and do not cross-apply a setback unless the ordinance text explicitly states that it applies to multiple target types. When a single clause explicitly lists multiple target types and one shared numeric setback, emit one row per explicitly listed feature using the same numeric value and units and cite the same clause in summary. Apply the shared numeric prioritization rules in $core_principles when multiple numeric values explicitly apply to the same feature.", "properties": { - "structures residential zones setback": { + "residential zones setback": { "description": "Minimum required separation from structures, dwellings, occupied buildings, residences, homes, residential receptors, residential uses, or residential zoning districts. Extract this feature only when the ordinance explicitly ties the setback to structures, residences, occupied buildings, or residential zones. 
If one clause applies a common setback to multiple structure-like receptors such as homes, occupied buildings, and residential districts, keep one row under this feature and preserve the exact receptor list in summary. Do not map generic property line or district boundary setbacks here unless the text explicitly names structures or residential zones." }, "property lines setback": { @@ -162,7 +161,7 @@ "description": "Non-setback numerical restriction features. Only extract if numerical values are explicitly given in the ordinance text.", "properties": { "noise": { - "description": "Extract maximum allowable operational noise for geothermal electricity facilities only when an explicit numeric limit is stated. Keep units exactly as written in the text. If the ordinance only references compliance with external standards or provides no numeric noise limit, omit this feature entirely." + "description": "Extract maximum allowable operational noise for geothermal electricity facilities only when an explicit numeric limit is stated. Normalize A-weighted units to 'dBA' in units and preserve verbatim wording in summary. If the ordinance only references compliance with external standards or provides no numeric noise limit, omit this feature entirely." }, "maximum height": { "description": "Extract maximum allowed structure, drill rig, stack, tower, cooling equipment, or facility height only when an explicit numeric cap is stated. If multiple height caps apply to the same geothermal electricity feature, keep the lowest maximum and list the alternatives in summary." @@ -233,14 +232,6 @@ "description": "Extract currently effective bans, moratoria, or explicit prohibitions on geothermal electricity exploration, drilling, well development, plant construction, or facility siting. Include fracking or hydraulic-fracturing bans only when the ordinance explicitly uses them to ban, limit, or condition geothermal electricity activity. 
If there are carve-outs, exceptions, or conditional permitting paths that still allow the project, do not classify the rule as a prohibition." } } - }, - "date_features": { - "description": "Effective, enacted, amended, or modified date information for the operative ordinance text.", - "properties": { - "ordinance date": { - "description": "Extract the ordinance enactment, adoption, amendment, or last-modified date when the source text explicitly states it. Use a string value that preserves the ordinance date wording exactly as written, and use units=null. Prefer the current operative enactment or amendment date over superseded historical dates unless the ordinance text clearly indicates multiple active dates are relevant." - } - } } }, "$examples": [ @@ -291,14 +282,6 @@ "section": "Section 9.3 - Financial Assurance", "summary": "'Prior to permit issuance, the operator shall provide financial assurance in a form acceptable to the state oil, gas, and geothermal agency in an amount sufficient to cover plugging, abandonment, reclamation, and decommissioning costs as determined by the agency engineer.'", "explanation": "The clause imposes an enforceable financial assurance requirement but leaves the amount to an agency-determined formula, so it fits 'bond requirement' with value and units set to null." - }, - { - "feature": "ordinance date", - "value": "Amended April 12, 2024", - "units": null, - "section": "Ord. No. 2024-11", - "summary": "'Ord. No. 2024-11, amended April 12, 2024.'", - "explanation": "The excerpt explicitly states the operative amendment date for the ordinance, so it matches 'ordinance date'." } ] } @@ -307,7 +290,9 @@ "general": [ "Use direct text excerpts and quotes in summary whenever possible.", "Each feature may appear at most once in outputs; do not emit multiple rows for the same feature. 
If multiple ordinance lines map to one feature, build a temporary map keyed by feature, aggregate all evidence clauses under that feature key, consolidate into one row, and keep the controlling most restrictive value in value while listing alternatives in summary.", + "Feature IDs are strict canonical keys. Do not output aliases, prefixed variants, or paraphrased feature names not present in the enum.", "For any numeric feature, the summary must support the same requirement that produced value and units for that row. Never pair a numeric value from one clause with qualitative-only language from another clause that has no numeric threshold.", + "Standardize units in the units field using this schema's canonical vocabulary, while preserving ordinance-specific wording in summary.", "Summary is the primary data carrier for all features in this schema; every row must have a non-null, non-empty string for summary.", "Every row must include an explanation that briefly justifies why the cited summary evidence matches the selected feature under this schema's rules.", "Emit only positively matched features. Never emit a row to explain why a feature does not apply.", @@ -351,10 +336,6 @@ "Distinguish between complete prohibition and conditional permitting. Conditional permitting is not a ban.", "Do not treat ordinary operational, environmental, design, monitoring, or permit conditions as prohibitions when the ordinance still allows the project to proceed subject to compliance.", "A fracking ban belongs here only when the ordinance explicitly uses it to regulate geothermal electricity development." - ], - "dates": [ - "For 'ordinance date', preserve the date wording exactly as written in the ordinance text and use units=null.", - "Prefer the current operative enacted, amended, or modified date over historical prior dates unless the source text clearly indicates multiple active dates are relevant to the current ordinance provision." 
] }, "$qa_checklist": [ From c6eb335a4609d88eea04104481ab926bed01cb84 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:19:26 -0600 Subject: [PATCH 11/21] Refactor URL sanitization: shared helper, space-only encoding, robust test (#401) * Initial plan * Extract shared _sanitize_url to url_utils.py, simplify to space-only encoding, fix test robustness Co-authored-by: bpulluta <115118857+bpulluta@users.noreply.github.com> Agent-Logs-Url: https://github.com/NatLabRockies/COMPASS/sessions/ceb782b4-c312-41d1-b4eb-eccbbef67097 * fix failing test * ruff error fix --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: bpulluta <115118857+bpulluta@users.noreply.github.com> --- compass/scripts/download.py | 25 +---------------- compass/web/url_utils.py | 12 ++++++++ compass/web/website_crawl.py | 37 ++----------------------- tests/python/unit/web/test_web_crawl.py | 1 + 4 files changed, 16 insertions(+), 59 deletions(-) create mode 100644 compass/web/url_utils.py diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 9199eed8..213ada92 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -2,14 +2,6 @@ import logging from contextlib import AsyncExitStack -from urllib.parse import ( - parse_qsl, - quote, - unquote, - urlencode, - urlparse, - urlunparse, -) from elm.web.document import PDFDocument from elm.web.search.run import ( @@ -32,6 +24,7 @@ JurisdictionWebsiteValidator, ) from compass.web.website_crawl import COMPASSCrawler, COMPASSLinkScorer +from compass.web.url_utils import _sanitize_url from compass.utilities.enums import LLMTasks from compass.utilities.io import load_local_docs from compass.pb import COMPASS_PB @@ -804,22 +797,6 @@ async def _contains_relevant_text( return found_text -def _sanitize_url(url): - """Percent-encode spaces and unsafe characters in a URL path""" - parsed = urlparse(url) - 
safe_path = quote(unquote(parsed.path), safe="/") - query_params = parse_qsl(parsed.query, keep_blank_values=True) - safe_query = urlencode(query_params, doseq=True) # cspell: disable-line - return urlunparse(( - parsed.scheme, - parsed.netloc, - safe_path, - parsed.params, - safe_query, - parsed.fragment, - )) - - def _sanitize_doc_sources(docs): """Rewrite source attrs on documents returned by ELMWebsiteCrawler diff --git a/compass/web/url_utils.py b/compass/web/url_utils.py new file mode 100644 index 00000000..2eeb464d --- /dev/null +++ b/compass/web/url_utils.py @@ -0,0 +1,12 @@ +"""Shared URL utilities for COMPASS web modules""" + +from urllib.parse import quote, urlsplit, urlunsplit + + +def _sanitize_url(url): + """Encode unsafe URL characters while preserving URL semantics""" + parsed = urlsplit(url) + path = quote(parsed.path, safe="/:@-._~!$&'()*+,;=") + query = quote(parsed.query, safe="=&;%:@-._~!$&'()*+,;/?:") + fragment = quote(parsed.fragment, safe="") + return urlunsplit((parsed.scheme, parsed.netloc, path, query, fragment)) diff --git a/compass/web/website_crawl.py b/compass/web/website_crawl.py index a5fd2093..ccda23ff 100644 --- a/compass/web/website_crawl.py +++ b/compass/web/website_crawl.py @@ -9,15 +9,7 @@ import operator from collections import Counter from contextlib import AsyncExitStack -from urllib.parse import ( - urlparse, - urlunparse, - quote, - unquote, - parse_qsl, - urlencode, - urljoin, -) +from urllib.parse import urljoin from crawl4ai.models import Link as c4AILink from bs4 import BeautifulSoup @@ -28,6 +20,7 @@ from elm.web.document import PDFDocument, HTMLDocument from elm.web.file_loader import AsyncWebFileLoader from elm.web.website_crawl import ELMLinkScorer, _SCORE_KEY # noqa: PLC2701 +from compass.web.url_utils import _sanitize_url logger = logging.getLogger(__name__) @@ -495,32 +488,6 @@ def _debug_info_on_links(links): logger.debug(" ...") -def _sanitize_url(url): - """Fix common URL issues - - - Encode spaces and 
unsafe characters in the path - - Encode query parameters safely - - Leave existing percent-encoding intact - """ - parsed = urlparse(url) - - safe_path = quote(unquote(parsed.path), safe="/") - - query_params = parse_qsl(parsed.query, keep_blank_values=True) - safe_query = urlencode(query_params, doseq=True) # cspell: disable-line - - return urlunparse( - ( - parsed.scheme, - parsed.netloc, - safe_path, - parsed.params, - safe_query, - parsed.fragment, - ) - ) - - def _extract_links_from_html(text, base_url): """Parse HTML and extract all links""" soup = BeautifulSoup(text, "html.parser") diff --git a/tests/python/unit/web/test_web_crawl.py b/tests/python/unit/web/test_web_crawl.py index 24ae0e61..30dd2a86 100644 --- a/tests/python/unit/web/test_web_crawl.py +++ b/tests/python/unit/web/test_web_crawl.py @@ -267,6 +267,7 @@ def test_extract_links_from_html_sets_text_from_anchor(): Permit Standards """ links = _extract_links_from_html(html, base_url="https://example.com") + assert len(links) == 1 link = next(iter(links)) assert link.title == "Permit Standards" assert link.text == "Permit Standards" From 93ad791545e211f2d6e3ac155ceff87ea66bee48 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Mar 2026 11:23:08 -0600 Subject: [PATCH 12/21] Bump release-drafter/release-drafter from 7.0.0 to 7.1.1 (#402) Bumps [release-drafter/release-drafter](https://github.com/release-drafter/release-drafter) from 7.0.0 to 7.1.1. - [Release notes](https://github.com/release-drafter/release-drafter/releases) - [Commits](https://github.com/release-drafter/release-drafter/compare/3a7fb5c85b80b1dda66e1ccb94009adbbd32fce3...139054aeaa9adc52ab36ddf67437541f039b88e2) --- updated-dependencies: - dependency-name: release-drafter/release-drafter dependency-version: 7.1.1 dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/release_drafter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release_drafter.yml b/.github/workflows/release_drafter.yml index 91ffb00f..173452ba 100644 --- a/.github/workflows/release_drafter.yml +++ b/.github/workflows/release_drafter.yml @@ -11,6 +11,6 @@ jobs: runs-on: ubuntu-latest steps: - name: Release Drafter - uses: release-drafter/release-drafter@3a7fb5c85b80b1dda66e1ccb94009adbbd32fce3 # v7.0.0 + uses: release-drafter/release-drafter@139054aeaa9adc52ab36ddf67437541f039b88e2 # v7.1.1 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From a0fbd553679b354fb8625c70224848b4a950e833 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Thu, 26 Mar 2026 16:56:50 -0600 Subject: [PATCH 13/21] Add One-Shot Skills Reliability and Guardrails (#397) * Add COMPASS workflow skills * Added one-shot skills * update one-shot SKILL.md structure and trigger contracts * Initial plan (#398) Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> * Fix skills documentation: correct paths, caching behavior, and tab formatting (#399) * Initial plan * Fix all review comments in skills documentation Co-authored-by: bpulluta <115118857+bpulluta@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: bpulluta <115118857+bpulluta@users.noreply.github.com> * renamed skills and fixed minor comments * updated skills Paul review march 26 --------- Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> --- .github/skills/document-retrieval/SKILL.md | 173 ++++++++++++ .github/skills/extraction-run/SKILL.md | 273 +++++++++++++++++++ .github/skills/plugin-config-setup/SKILL.md | 279 ++++++++++++++++++++ .github/skills/schema-creation/SKILL.md | 178 
+++++++++++++ 4 files changed, 903 insertions(+) create mode 100644 .github/skills/document-retrieval/SKILL.md create mode 100644 .github/skills/extraction-run/SKILL.md create mode 100644 .github/skills/plugin-config-setup/SKILL.md create mode 100644 .github/skills/schema-creation/SKILL.md diff --git a/.github/skills/document-retrieval/SKILL.md b/.github/skills/document-retrieval/SKILL.md new file mode 100644 index 00000000..9c077424 --- /dev/null +++ b/.github/skills/document-retrieval/SKILL.md @@ -0,0 +1,173 @@ +--- +name: document-retrieval +description: Build and tune retrieval configs that search, rank, and collect ordinance documents in COMPASS. Use whenever a user asks to improve retrieval precision/recall, tune search queries/keywords, or debug acquisition quality before extraction tuning. +--- + +# Web Scraper Skill + +Use this skill to improve retrieval precision/recall before extraction tuning. +Applies to both one-shot (schema-driven) and legacy decision-tree extraction +pipelines. + +## When to use + +- Download step returns noisy sources (one-shot extraction). +- Ordinance recall is weak across jurisdictions (one-shot extraction). +- LLM filtering is compensating for poor search quality. + +## Do not use + +- Schema feature definition or value extraction logic design. +- Post-extraction feature/value debugging when retrieval is already correct. + +## Expected assistant output + +When using this skill, return: + +1. The retrieval axis changed (queries, keyword weights, or heuristics). +2. Evidence from artifacts/logs showing why the change was needed. +3. The next run command against the same jurisdiction sample. 
+ +## Canonical reference + +Consult example plugin configurations in `examples/`: +- `examples/one_shot_schema_extraction/plugin_config.yaml` — standard one-shot config +- `examples/water_rights_demo/one-shot/plugin_config.yaml` — multi-document edge cases + +When creating new tech configs, use `_plugin_config.yaml` as a recommended +naming convention (e.g. `geothermal_plugin_config.yaml`). + +## Scope + +- Query-template strategy. +- URL ranking and filtering patterns. +- Heuristic phrase controls before LLM validation. + +## Two retrieval phases + +COMPASS runs two sequential acquisition passes per jurisdiction: + +1. **Search-engine phase** — queries `SerpAPIGoogleSearch` (or configured + engine) using `query_templates`. This phase is the primary source of + ordinance documents. +2. **Website crawl phase** — crawls the jurisdiction's official website, + ranking pages using `website_keywords`. This phase is a secondary pass + and runs only if the search-engine phase did not yield an ordinance + context. + +Key behaviors: +- Playwright browser errors during the website crawl phase are **non-fatal**. + COMPASS logs the error and continues. +- `Found 0 potential documents` at the end of the crawl phase is **expected** + for jurisdictions without relevant online ordinances. +- Disable the crawl phase with `perform_website_search: false` in run config + when you want faster smoke tests or Playwright is unavailable. + +## Key management + +For SerpAPI-backed search, keep `api_key` out of committed config and provide +`SERPAPI_KEY` via environment (for example through `.env` loaded in shell). + +Recommended shell setup: + +```bash +set -a +source .env +set +a +``` + +Avoid spaces around `=` in `.env` assignments. + +## Retrieval design pattern + +1. Create 3-7 jurisdiction queries with `{jurisdiction}`. +2. Weight legal document indicators in URL keywords. +3. Apply exclusions for templates/reports/slides. +4. Add focused negative tech terms to reduce false positives. 
+5. Start with dynamic search, then switch to deterministic known URLs when + search infrastructure is unstable. + +When using `heuristic_keywords`, use these four lists to guide pre-LLM filtering: +- `GOOD_TECH_KEYWORDS` — strong indicators of the target technology + (e.g., facility types, deployment modes). Documents matching even a + few keywords are marked as candidates. +- `GOOD_TECH_PHRASES` — multi-word phrases that signal relevant + ordinance content. Keep specific to avoid false positives. +- `GOOD_TECH_ACRONYMS` — industry-standard abbreviations for the + technology. Narrow list; include only widely recognized acronyms. +- `NOT_TECH_WORDS` — pre-heuristic filter that rejects documents + before keyword matching. Use to exclude adjacent technologies and + irrelevant domains (e.g., residential HVAC, unrelated industries). + Runs first; prevents wasted keyword evaluation on clearly-wrong + documents. + +If any required list is missing or empty, COMPASS raises a plugin +configuration error and extraction quality should be treated as failed. + +For first-pass reliability, test retrieval with deterministic known URLs +before using live web search. + +## Technology-specific retrieval controls (template) + +- Include target-technology facility/deployment terms. +- Exclude adjacent and non-target terms (residential/HVAC/PV/etc as needed). +- Favor jurisdictional legal-code signals like `land use code`, + `code of ordinances`, `use table`, and `special use permit`. 
+ +## Deterministic smoke-test mode +For this smoke test, at least one of the following documentation sources must be provided: + +- **`known_doc_urls`**: A list of URLs pointing to external documentation that the scraper can access and parse +- **`known_local_docs`**: A collection of local documentation files available in the repository or system + +Use run-config controls to bypass flaky search while tuning: + +- supply `known_doc_urls` or `known_local_docs`, +- set `perform_se_search: false`, +- set `perform_website_search: false`. + +Then validate: + +- download artifacts exist, +- cleaned text exists, +- ordinance DB rows are non-empty. + +## Tuning loop + +1. Run SE-search phase on small sample. +2. Inspect kept vs discarded PDFs (`ordinance_files/`). +3. Run heuristic filter and review false rejects/accepts (`cleaned_text/`). +4. Check website crawl phase independently if needed (enable, run, inspect logs). +5. Update one axis only: + - query templates (affects SE phase), + - URL weights (affects both phases), + - include/exclude heuristic patterns (pre-LLM filter), + - `NOT_TECH_WORDS` (upstream document rejection). +6. Re-run same sample and compare. + +## Cross-tech onboarding + +When reusing this workflow for any technology: + +- keep legal retrieval tokens (`ordinance`, `zoning`, `code`), +- replace all technology terms in `query_templates`, `website_keywords`, + and `heuristic_keywords`, +- seed `known_doc_urls` with authoritative regulatory documents for smoke + testing, +- avoid copying negatives from previous technologies into the new tech config, +- verify `NOT_TECH_WORDS` excludes adjacent technologies for your domain. + +## Phase gates + +- **3 jurisdictions**: ensure major source classes are found. +- **10 jurisdictions**: verify stability across regions. + + +## Guardrails + +- Keep feature extraction logic out of retrieval config. +- Do not overfit to one county's document style. +- Preserve auditable rationale for each retrieval change. 
+- Keep one canonical retrieval config per active technology. +- Ensure each run uses a unique `out_dir` to avoid COMPASS aborting early. + diff --git a/.github/skills/extraction-run/SKILL.md b/.github/skills/extraction-run/SKILL.md new file mode 100644 index 00000000..c2fafa8b --- /dev/null +++ b/.github/skills/extraction-run/SKILL.md @@ -0,0 +1,273 @@ +--- +name: extraction-run +description: Execute one-shot extraction with COMPASS and iterate quickly with low cost. Use whenever a user asks to run, smoke-test, validate, debug, or scale one-shot schema extraction for any technology. +--- + +# Extraction Run Skill + +**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction. +For decision-tree extraction (solar, wind, small wind), consult COMPASS +architecture docs. + +Use this skill to run one-shot extraction in a repeatable, low-risk way, +then iterate quickly until you have stable structured outputs. + +## When to use + +- Schema exists and plugin config points to it. +- You need a reliable smoke-test workflow before scaling. +- You are NOT using decision-tree extraction. + +## Do not use + +- Decision-tree extraction feature engineering. +- Python parser implementation in `compass/extraction//parse.py`. +- Non-extraction tasks (for example docs-only updates). + +## Expected assistant output + +When using this skill, return: + +1. The exact `pixi run compass process ...` command used. +2. A pass/fail decision against extraction-quality gates. +3. The smallest next config/schema change and why. + +## Canonical reference + +- `examples/one_shot_schema_extraction/` — complete working examples +- `examples/one_shot_schema_extraction/README.rst` — general one-shot overview +- `examples/water_rights_demo/one-shot/` — multi-doc extraction example + +## Two-pipeline modes + +COMPASS supports two distinct extraction pipelines. 
Choose one and do not mix +them for the same technology: + +| Mode | Where code lives | Good for | +|---|---|---| +| **One-shot (schema-based)** | `examples/` → `compass/extraction//` | New techs, no Python changes | +| **decision-tree** | Python code in `compass/extraction//` | Existing solar, wind, small wind | + +One-shot is the correct path for all new technology onboarding. It requires +only a schema JSON, a plugin YAML, and a run config — no Python source changes. + +## Tech promotion lifecycle + +New technology assets start in `examples/` and finish in `compass/extraction/`: + +1. **Develop** — place all assets in `examples/one_shot_schema_extraction/` +2. **Stabilize** — iterate schema/plugin until smoke and robustness gates pass +3. **Promote** — copy the three finalized files into `compass/extraction//`: + - `_schema.json` + - `_plugin_config.yaml` + - `__init__.py` — registers the plugin via `create_schema_based_one_shot_extraction_plugin` + + After creating the package, add an import in `compass/extraction/__init__.py` + to register the plugin at startup. See `compass/extraction/ghp/__init__.py` + for a reference implementation. + +## Required inputs + +- Run config for `compass process`. +- Plugin config containing `schema`. +- API keys in environment (never hardcode in configs). +- A jurisdiction set sized to the current phase. + +## Preflight checks (must pass before run) + +- Jurisdiction CSV has headers `County,State` or `County,State,Subdivision,Jurisdiction Type`. +- `out_dir` is unique for this run. +- At least one acquisition step is enabled: + `perform_se_search: true`, `perform_website_search: true`, + `known_doc_urls`, or `known_local_docs`. +- If `heuristic_keywords` exists, all four required lists are present and + non-empty. 
+ +## Naming convention + +Use tech-first names for all one-shot assets: + +- `_plugin_config.yaml` +- `_schema.json` +- `_jurisdictions*.csv` + +The `tech` value in the run config must be a string that becomes the plugin +registry identifier. It must be unique, lowercase, and underscore-separated +(for example `concentrating_solar`, `geothermal_electricity`). COMPASS will +raise `Unknown tech input` if this key does not match any registered plugin. + +## Canonical development pattern + +For early development, start with the proven dynamic baseline, then fall back +to deterministic mode only when search infrastructure is unstable: + +1. Use one small jurisdiction file (1-3 rows). +2. Use your preferred configured search engine. +3. Load `.env` into shell (`set -a && source .env && set +a`). +4. Run with verbose logs: + - `pixi run compass process -c config.json5 -p plugin.yaml -v` +5. Confirm output artifacts exist before tuning schema semantics. + +Fallback mode when needed: + +- Add `known_doc_urls` (or `known_local_docs`) in run config. +- Set `perform_se_search: false` and `perform_website_search: false`. + +## Adaptation rule + +When adapting this workflow for a new technology, keep the run structure +unchanged and swap only technology-specific inputs: + +- `tech` in run config, +- schema file, +- plugin descriptor (`data_type_short_desc`), +- retrieval query/keyword vocabulary, +- known document URL set. + +Change one axis per run unless debugging infrastructure failures. + +## Environment setup + +Load secrets from `.env` before running. Never commit key values in config files. + +```bash +set -a && source .env && set +a # no spaces around = in .env assignments +``` + +## Core command + +```bash +pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v +``` + +## Phase-gated workflow + +1. **Smoke test (1 jurisdiction)** + - Goal: verify wiring and output contract. +2. 
**Robustness (5 jurisdictions)** + - Goal: verify feature stability and edge-case handling. + +## Validation checklist + +Evaluate each run on: + +- document relevance (exclude off-domain content), +- feature coverage vs expected ordinance topics, +- section/summary traceability, +- unit consistency, +- null discipline, + +## Expected output artifacts + +A successful run produces these files under `out_dir`: + +| Artifact | Meaning | +|---|---| +| `ordinance_files/*.pdf` | Downloaded source documents | +| `cleaned_text/*.txt` | Heuristic-filtered extracted text | +| `jurisdiction_dbs/*.csv` | Per-jurisdiction raw extraction rows | +| `quantitative_ordinances.csv` | Final compiled numeric features | +| `qualitative_ordinances.csv` | Final compiled qualitative features | +| `usage.json` | Per-jurisdiction LLM token and request counts | +| `meta.json` | Run metadata (cost, timing, version) | + +Final CSV columns: `county`, `state`, `subdivision`, `jurisdiction_type`, +`FIPS`, `feature`, `value`, `units`, `adder`, `min_dist`, `max_dist`, +`summary`, `year`, `section`, `source`. + +## Interpreting output status correctly + +`cleaned_text` files can exist while `Number of documents found` is `0`. + +This means acquisition/text collection worked, but no final structured ordinance +rows were emitted into consolidated DB outputs. + +Check in order: + +1. `outputs/*/cleaned_text/*.txt` (text extraction present) +2. `outputs/*/jurisdiction_dbs/*.csv` (per-jurisdiction parsed rows) +3. 
`outputs/*/quantitative_ordinances.csv` and + `outputs/*/qualitative_ordinances.csv` (final compiled results) + +Treat the run as **failed for extraction quality** when either is true: +- `Number of jurisdictions with extracted data: 0` +- any configuration exception appears in logs (even if process exits 0) + +Only treat a run as passing when both are true: +- at least one jurisdiction has extracted data +- at least one jurisdiction CSV in `jurisdiction_dbs/` has more than header row + +## Root-cause triage + +- **Wrong or noisy documents** + - Tune query templates, URL keywords, and exclusions. + - Prefer `known_doc_urls` while stabilizing. +- **Right documents, wrong fields** + - Tune schema descriptions/examples and ambiguity rules. + - Check `extraction_system_prompt` in plugin YAML — it is the primary + guard against scope bleed from generic legal documents. +- **Correct values, unstable formatting** + - Tighten enums, unit vocabulary, and null behavior. +- **Nothing downloaded / unstable search** + - Disable live search and use deterministic known URLs/local docs. +- **0 documents found for a jurisdiction during website crawl** + - Expected for jurisdictions with few online ordinances. The website + crawl is a second acquisition pass after search-engine retrieval; + 0 results there is not a pipeline failure. + +## Acceptance gates + +Do not advance phases until all are true: + +- Output rows conform to required contract. +- High share of rows include useful `section` and `summary`. +- Feature names are stable and machine-consistent. +- Repeated runs on same sample show minimal drift. + +## Cost and speed controls + +- Keep sample size minimal while tuning. +- Change one variable per run. +- Archive run command, input set, and output path for each iteration. 
+ +## Workspace hygiene (important) + +Keep one canonical working set per technology in `examples/`: + +- one run config, +- one plugin config, +- one schema, +- one jurisdiction file, +- one known docs file. + +Delete stale `_migrated`, `_smoke`, and duplicate output folders to avoid +configuration drift and debugging confusion. + +## Known infrastructure issues + +### Playwright timeouts + +Web search via `rebrowser_playwright` may fail with 60s timeouts on +`Page.wait_for_selector`. Symptoms: +- `TimeoutError: Page.wait_for_selector: Timeout 60000ms exceeded` +- All search queries fail consistently +- Browser session crashes with `ProtocolError: Internal server error, session closed` + +These errors during the **website crawl phase** (second acquisition pass) are +**non-fatal**. COMPASS logs them and continues. They do not block the +search-engine phase or extraction. + +If search itself is failing, verify provider credentials are loaded and fall +back to deterministic mode. + +**Workaround**: Use `known_local_docs` or `known_doc_urls` and disable +search/website steps while validating extraction logic. + +### known_local_docs loading failures + +`known_local_docs` may fail silently with `ERROR: Failed to read file` in +jurisdiction logs due to external loader behavior. + +**Workaround**: Prefer `known_doc_urls` for deterministic smoke tests and +pre-validate local docs before pipeline runs. + diff --git a/.github/skills/plugin-config-setup/SKILL.md b/.github/skills/plugin-config-setup/SKILL.md new file mode 100644 index 00000000..0c83b5f9 --- /dev/null +++ b/.github/skills/plugin-config-setup/SKILL.md @@ -0,0 +1,279 @@ +--- +name: plugin-config-setup +description: Author and tune one-shot plugin YAML for COMPASS document discovery, filtering, and text collection. Use whenever a user asks to create, clean up, standardize, or troubleshoot one-shot plugin YAML for technology onboarding. 
+--- + +# YAML Setup Skill + +**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction. +For legacy decision-tree extraction, consult COMPASS architecture docs. + +Use this skill to create or tune one-shot plugin YAML that controls retrieval, +filtering, and text collection behavior. + +## When to use + +- New technology onboarding in one-shot extraction (NOT decision-tree extraction). +- Schema exists but source relevance is weak. +- You need reproducible config handoff across teams. + +## Do not use + +- Legacy decision-tree parser implementation changes. +- Schema feature semantics work that belongs in `_schema.json`. +- Run-result diagnosis after outputs are generated (use iteration loop skill). + +## Expected assistant output + +When using this skill, return: + +1. The finalized plugin YAML content or exact diff. +2. Any required paired run-config changes. +3. A validation command and pass/fail checks for the edited YAML. + +## Canonical reference + +Consult the working examples in `examples/`: +- `examples/one_shot_schema_extraction/plugin_config.yaml` — standard working example +- `examples/water_rights_demo/one-shot/plugin_config.yaml` — multi-doc edge case + +When creating new tech configs, `_plugin_config.yaml` is the recommended +naming convention (e.g. `geothermal_plugin_config.yaml`). The existing +`plugin_config.yaml` examples use a generic name; new tech-specific assets +should use the tech-first naming pattern. + +Refer to any complete example in `examples/` that matches your retrieval goals. + +## Naming convention + +Use tech-first file names when creating new one-shot assets: +`_config*.json5`, `_plugin_config.yaml`, +`_schema.json`, `_jurisdictions*.csv`. + +## Secret handling + +Keep API keys in environment variables (for example `SERPAPI_KEY`, +`AZURE_OPENAI_API_KEY`) rather than in plugin or run config files. +Load them per shell session with `set -a && source .env && set +a`. 
+Avoid spaces around `=` in `.env` assignments. + +## Required minimum + +```yaml +schema: ./my_schema.json +``` + +## Non-negotiable runtime constraints + +- Jurisdiction CSV headers are case-sensitive: use `County,State`. +- If `heuristic_keywords` is present, it must include all four lists and + none may be empty. +- A run is not considered passing if logs show config errors or if + extracted jurisdiction count is zero. + +## Key plugin YAML fields + +| Field | Type | Code Reference | +|---|---|---| +| `schema` | string (path) | [base.py#L124–L131](../../../compass/plugin/one_shot/base.py) | +| `data_type_short_desc` | string | [base.py#L483](../../../compass/plugin/one_shot/base.py#L483) | +| `query_templates` | list | [base.py#L217–L240](../../../compass/plugin/one_shot/base.py#L217) | +| `website_keywords` | dict | [base.py#L281–L338](../../../compass/plugin/one_shot/base.py#L281) | +| `heuristic_keywords` | dict or `true` | [base.py#L340–L390](../../../compass/plugin/one_shot/base.py#L340); [base.py#L512](../../../compass/plugin/one_shot/base.py#L512) | +| `collection_prompts` | list or `true` | [base.py#L413–L436](../../../compass/plugin/one_shot/base.py#L413) | +| `text_extraction_prompts` | list or `true` | [base.py#L438–L468](../../../compass/plugin/one_shot/base.py#L438) | +| `extraction_system_prompt` | string | [base.py#L476–L488](../../../compass/plugin/one_shot/base.py#L476) | +| `cache_llm_generated_content` | bool | [base.py#L107–L117](../../../compass/plugin/one_shot/base.py#L107) | + +**For the complete list of all configuration options (including `allow_multi_doc_extraction` and any future additions), consult the docstring of [`create_schema_based_one_shot_extraction_plugin()`](../../../compass/plugin/one_shot/base.py#L51).** + +## Required `heuristic_keywords` shape + +When using `heuristic_keywords`, use these four lists to guide pre-LLM filtering: +- `GOOD_TECH_KEYWORDS` — strong indicators of the target technology + (e.g., facility types, 
deployment modes). Documents matching even a + few keywords are marked as candidates. +- `GOOD_TECH_PHRASES` — multi-word phrases that signal relevant + ordinance content. Keep specific to avoid false positives. +- `GOOD_TECH_ACRONYMS` — industry-standard abbreviations for the + technology. Narrow list; include only widely recognized acronyms. +- `NOT_TECH_WORDS` — pre-heuristic filter that rejects documents + before keyword matching. Use to exclude adjacent technologies and + irrelevant domains (e.g., residential HVAC, unrelated industries). + Runs first; prevents wasted keyword evaluation on clearly-wrong + documents. + +Use this exact structure when defining `heuristic_keywords`: + +```yaml +heuristic_keywords: + GOOD_TECH_KEYWORDS: + - "" + GOOD_TECH_PHRASES: + - "" + GOOD_TECH_ACRONYMS: + - "" + NOT_TECH_WORDS: + - "" +``` + +Notes: +- Keys are normalized, but using canonical key names reduces mistakes. +- All four lists are required and must be non-empty. + +### `collection_prompts: true` and `text_extraction_prompts: true` + +Setting either flag to `true` (not a list) instructs COMPASS to use the LLM +to auto-generate the prompts from the schema content. This is the recommended +shortcut during development — do not write manual prompt lists until +auto-generated ones prove insufficient. + +### `extraction_system_prompt` + +This is the primary control for preventing scope bleed from generic land-use +code documents. Write it as a multi-line YAML literal block: + +```yaml +extraction_system_prompt: |- + You are a legal scholar extracting structured data from + utility-scale ordinances. + + Extract only enacted requirements for utility-scale facilities. + Exclude adjacent technologies and non-target use cases. + Prefer explicit values. Use null for qualitative obligations. +``` + +See `compass/extraction/ghp/plugin_config.yaml` for a complete example. + +## Progressive config path + +1. **Minimal** + - Confirm schema path and extraction invocation work. +2. 
**Simple** + - Add `query_templates`, `heuristic_keywords`, and `cache_llm_generated_content`. +3. **Full** + - Add `extraction_system_prompt` if scope bleed or off-domain extraction + is observed. + - Set `collection_prompts: true` and `text_extraction_prompts: true` to + let the LLM auto-generate prompts from the schema. + - Replace `heuristic_keywords: true` with an explicit list if precision + is insufficient. + +Use the same progression for any technology. + +## Baseline YAML pattern + +```yaml +schema: ./my_schema.json +data_type_short_desc: utility-scale ordinance +cache_llm_generated_content: true +query_templates: + - "filetype:pdf {jurisdiction} ordinance" + - "{jurisdiction} zoning ordinance" + - "{jurisdiction} permitting requirements" +website_keywords: + pdf: 92160 + : 46080 + ordinance: 23040 + zoning: 2880 + permit: 1440 +heuristic_keywords: + GOOD_TECH_KEYWORDS: + - "" + - "" + GOOD_TECH_ACRONYMS: + - "" + GOOD_TECH_PHRASES: + - "" + - "" + NOT_TECH_WORDS: + - "" + - "" +``` + +Swap vocabulary for any technology while keeping the same structure. + +## Stable development mode + +Use run-config controls for deterministic smoke tests while iterating schema: + +- `known_doc_urls` or `known_local_docs` — bypass live search +- `perform_se_search: false` — disable search-engine phase +- `perform_website_search: false` — disable website crawl phase + +Re-enable search only after extraction quality is stable on known documents. + +Recommended baseline: use dynamic search first, then use deterministic mode +if search infrastructure fails. 
+ +## Minimal run-config contract (to pair with plugin YAML) + +Use this pattern and require users to provide their own model and client +values: + +```json5 +{ + out_dir: "./outputs__", + tech: "", + jurisdiction_fp: "./_jurisdictions.csv", + perform_se_search: true, + perform_website_search: false, + model: [ + { + name: "", + llm_call_kwargs: { temperature: 0, timeout: 600 }, + client_kwargs: { + api_version: "", + azure_endpoint: "" + } + } + ] +} +``` + +## Acquisition phases + +COMPASS acquisition runs in two sequential phases per jurisdiction: + +1. **Search-engine phase** — uses `SerpAPIGoogleSearch` or similar; driven by + `query_templates`. +2. **Website crawl phase** — crawls the jurisdiction's main website using + `website_keywords` for ranking. Playwright browser errors during this + phase are **non-fatal**; COMPASS logs them and moves on. + +`perform_website_search: false` skips phase 2. Use it during smoke tests to +keep run time short and avoid Playwright dependency issues. + +## Validation checklist + +- Schema path resolves from runtime working directory. +- Query templates include `{jurisdiction}` consistently. +- URL weights favor legal and government documents. +- Heuristic exclusions are precise and not over-broad. +- Prompt overrides are only added when default behavior fails. + +## Cross-tech adaptation checklist + +When adapting to another technology: + +- replace vocabulary in `query_templates` and `website_keywords`, +- keep legal-code terms (`ordinance`, `zoning`, `code of ordinances`), +- keep non-target exclusions explicit in `NOT_TECH_WORDS`, +- do not carry terms from a previous technology into new tech configs, +- write a technology-specific `extraction_system_prompt`. + +## Run command + +```bash +pixi run compass process -c config.json5 -p path/to/plugin_config.yaml -v +``` + +If running outside the tech folder, use absolute paths for `-c` and `-p`. + +## Guardrails + +- Retrieval behavior belongs in plugin YAML. 
+- Feature logic belongs in schema. +- Adjust one tuning axis per run for clean attribution. +- Keep one canonical plugin file per technology in the active example folder. + diff --git a/.github/skills/schema-creation/SKILL.md b/.github/skills/schema-creation/SKILL.md new file mode 100644 index 00000000..08981132 --- /dev/null +++ b/.github/skills/schema-creation/SKILL.md @@ -0,0 +1,178 @@ +--- +name: schema-creation +description: Author and iterate one-shot extraction schemas for native COMPASS. Use whenever a user asks to create, expand, or debug schema feature definitions, value/unit rules, or extraction instructions. +--- + +# Schema Creation Skill + +**ONE-SHOT EXTRACTION ONLY.** This skill applies only to schema-driven extraction +(new technology onboarding with JSON schema + plugin YAML). For legacy decision-tree +extraction (existing solar/wind/small-wind in `compass/extraction//`), +consult COMPASS architecture docs. + +Use this skill to define what the LLM extracts and how it formats results. +The schema is the single most important config file for output quality. + +## When to use + +- Starting a new one-shot technology extraction (NOT decision-tree legacy extraction). +- Fixing inconsistent or incorrect extracted values in one-shot extraction. +- Adding new features to an existing one-shot extraction. + +## Do not use + +- Retrieval tuning tasks that belong in plugin YAML. +- Legacy decision-tree extraction parser implementation. + +## Expected assistant output + +When using this skill, return: + +1. The proposed schema diff (or full schema block) for the targeted features. +2. The rationale for VALUE, UNITS, and IGNORE wording. +3. A smoke-test check plan for validating the schema change. 
+ +## Canonical reference + +For complete examples, see the `examples/` directory: +- `examples/one_shot_schema_extraction/wind_schema.json` +- `examples/water_rights_demo/one-shot/water_rights_schema.json5` + +Each follows the pattern: `_schema.json` or `_schema.json5`. + +## Required output contract + +Every schema must define `outputs` as an array. Each item must require +exactly these five fields and set `additionalProperties: false`: + +```json +{ + "type": "object", + "required": ["outputs"], + "additionalProperties": false, + "properties": { + "outputs": { + "type": "array", + "items": { + "type": "object", + "required": ["feature", "value", "units", "section", "summary"], + "additionalProperties": false, + "properties": { + "feature": { "type": "string", "enum": ["..."] }, + "value": { "anyOf": [{"type": "number"}, {"type": "string"}, {"type": "boolean"}, {"type": "array", "items": {"type": "string"}}, {"type": "null"}] }, + "units": { "type": ["string", "null"] }, + "section": { "type": ["string", "null"] }, + "summary": { "type": ["string", "null"] } + } + } + } + } +} +``` + +These five fields map directly to the output CSV columns. COMPASS adds +`county`, `state`, `FIPS`, and other metadata columns automatically. + +## Build sequence + +1. **Define the feature enum** — one stable lowercase ID per siting-relevant + requirement. Keep naming consistent across iterations and group IDs by + family (setbacks, noise, zoning, permitting). +2. **Define `value` and `units` rules per feature family** — in each + feature's `description`, state the expected value type and accepted unit + vocabulary explicitly. +3. **Add `$definitions`** — group related feature descriptions here to keep + the `feature` enum block clean. +4. **Add `$instructions`** — encode global extraction policy (scope, null + handling, one-row-per-feature contract, verbatim quote preference). +5. 
**Smoke-test on one jurisdiction** — validate all enum items appear in + output and null rows are correctly populated for missing features. + +## Feature definition template + +Every feature description must answer four questions: + +1. **What is this?** One sentence identifying the regulatory concept. +2. **VALUE rule:** What type is the value and what specific values/ranges are + valid? +3. **UNITS rule:** What unit string is accepted, or `null` if not applicable? +4. **IGNORE / CLARIFICATION:** What near-miss concepts must NOT match this + feature? + +Example (abbreviated): + +```json +"structure setback": { + "description": "Minimum distance from the generator to an occupied building. VALUE: numerical distance. UNITS: 'feet' or 'meters'. IGNORE: setbacks from property lines or roads — those are separate features." +} +``` + +## Feature family taxonomy + +Organize `$definitions` by these families: + +| Family | Example features | +|---|---| +| Setbacks | `structure setback`, `property line setback`, `road setback` | +| Noise/Emissions | `noise limit`, `emissions standard`, `vibration limit` | +| Operational | `hours of operation` | +| Physical design | `screening requirement`, `enclosure requirement`, `exhaust stack height` | +| Zoning | `primary use districts`, `conditional use districts`, `prohibited use districts` | +| Permitting | `permit requirement`, `capacity threshold` | +| Compliance | `decommissioning` | + +## `$instructions` block + +Always include a `$instructions` object at the top level with these keys: + +```json +"$instructions": { + "scope": "Describe exactly what to extract and what to ignore.", + "null_handling": "Output every enum feature. Use null value and null summary when a feature is not found in the document. Do not omit features.", + "verbatim_quotes": "In summary fields, prefer verbatim quotes from the source. Enclose in double quotation marks.", + "units_discipline": "Do not convert units. 
Record them exactly as they appear in the document." +} +``` + +## Scope bleed control + +When COMPASS retrieves a large land-use code instead of a tech-specific +ordinance, the LLM may extract off-domain provisions. + +Fix order (most powerful first): +1. `extraction_system_prompt` in plugin YAML — state explicitly what is in + scope and what is excluded. +2. `$instructions.scope` in schema — reinforce with exclusion language. +3. `heuristic_keywords.NOT_TECH_WORDS` — reject documents upstream. + +Do not expand the feature enum to absorb scope bleed. Narrow the prompt. + +## Cross-technology adaptation checklist + +When cloning a schema for a new technology: + +- [ ] Replace all feature IDs with technology-specific names. +- [ ] Replace value/units rules in every feature description. +- [ ] Replace exclusion terms in `$instructions.scope` and feature IGNORE + clauses. +- [ ] Replace `$definitions` group names to match new feature families. +- [ ] Smoke-test before widening to 10+ jurisdictions. + +## Quality checklist + +- [ ] Feature enum uses stable, consistent IDs across all runs. +- [ ] Every feature description contains VALUE, UNITS, and IGNORE clauses. +- [ ] `$instructions` block is present with all five keys. +- [ ] `additionalProperties: false` is set on the top-level object and on + each item in the `outputs` array. +- [ ] Schema validates cleanly against a JSON Schema validator. +- [ ] A smoke run using this schema produces extracted rows (not just + successful process exit logs). + +## Anti-patterns to avoid + +- Feature IDs that change names between iterations. +- Implicit unit assumptions not stated in description text. +- Missing IGNORE clauses for common near-miss features. +- Examples in descriptions that contradict field rules. +- Widening the enum to absorb scope bleed instead of tightening the prompt. 
From 56fe3897d3456d1afa0d97f1213db6acc9c3005d Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Mon, 30 Mar 2026 17:20:46 -0600 Subject: [PATCH 14/21] updated schema, added cli overwrite option, and added optional post process rules --- compass/_cli/process.py | 102 +++++++++- .../geothermal_plugin_config.yaml | 5 +- .../geothermal_schema.json | 127 +++++++----- compass/plugin/one_shot/components.py | 116 +++++++++++ tests/python/unit/scripts/test_cli_process.py | 181 ++++++++++++++++++ 5 files changed, 476 insertions(+), 55 deletions(-) create mode 100644 tests/python/unit/scripts/test_cli_process.py diff --git a/compass/_cli/process.py b/compass/_cli/process.py index 31712d5e..89a71112 100644 --- a/compass/_cli/process.py +++ b/compass/_cli/process.py @@ -2,8 +2,11 @@ import asyncio import logging +import shutil +import sys import warnings import multiprocessing +from pathlib import Path import click from rich.live import Live @@ -14,8 +17,11 @@ from compass.pb import COMPASS_PB from compass.plugin import create_schema_based_one_shot_extraction_plugin from compass.scripts.process import process_jurisdictions_with_openai -from compass.utilities.logs import AddLocationFilter from compass.utilities.io import load_config +from compass.utilities.logs import AddLocationFilter + + +OUT_DIR_POLICY_CHOICES = ["fail", "increment", "overwrite", "prompt"] @click.command @@ -49,10 +55,28 @@ default=None, help="One-shot plugin configuration to add to COMPASS before processing", ) -def process(config, verbose, no_progress, plugin): +@click.option( + "--out_dir_exists", + required=False, + default=None, + type=click.Choice(OUT_DIR_POLICY_CHOICES, case_sensitive=False), + help="How to handle an existing output directory." + " Choices: fail, increment, overwrite, prompt." + " If omitted, prompts interactively when running in a terminal," + " or fails when running non-interactively (e.g. 
CI).", +) +def process(config, verbose, no_progress, plugin, out_dir_exists): """Download and extract ordinances for a list of jurisdictions""" config = load_config(config) + if out_dir_exists is not None: + out_dir_policy = out_dir_exists + elif sys.stdin.isatty(): + out_dir_policy = "prompt" + else: + out_dir_policy = "fail" + config["out_dir"] = _resolve_out_dir_conflict(config["out_dir"], out_dir_policy) + if plugin is not None: create_schema_based_one_shot_extraction_plugin( config=plugin, tech=config["tech"] @@ -128,3 +152,77 @@ def _setup_cli_logging(console, verbosity_level, log_level="INFO"): handler.addFilter(AddLocationFilter()) logger.addHandler(handler) logger.setLevel(log_level) + + +def _resolve_out_dir_conflict(out_dir, policy): + """Handle existing output directory using the selected policy""" + out_dir = Path(out_dir) + policy = policy.lower() + + if not out_dir.exists(): + return out_dir + + if policy == "fail": + return out_dir + + if policy == "increment": + new_out_dir = _next_versioned_directory(out_dir) + click.echo( + "Output directory exists. " + f"Using incremented directory: {new_out_dir!s}" + ) + return new_out_dir + + if policy == "overwrite": + click.echo(f"Overwriting existing output directory: {out_dir!s}") + shutil.rmtree(out_dir) + return out_dir + + if policy == "prompt": + if not sys.stdin.isatty(): + msg = ( + "Cannot use out_dir_exists='prompt' in non-interactive mode. " + "Use one of: fail, increment, overwrite." + ) + raise click.ClickException(msg) + + create_incremented = click.confirm( + f"Output directory '{out_dir!s}' already exists. 
" + "Create a new incremented directory automatically?", + default=True, + ) + if create_incremented: + new_out_dir = _next_versioned_directory(out_dir) + click.echo(f"Using incremented directory: {new_out_dir!s}") + return new_out_dir + + overwrite = click.confirm( + f"Overwrite '{out_dir!s}' by deleting it and continuing?", + default=False, + ) + if overwrite: + click.echo(f"Overwriting existing output directory: {out_dir!s}") + shutil.rmtree(out_dir) + return out_dir + + msg = ( + "Run cancelled. Please update out_dir in config, or rerun with " + "--out_dir_exists increment/overwrite." + ) + raise click.ClickException(msg) + + msg = ( + f"Unknown out_dir_exists policy '{policy}'. " + f"Supported values: {OUT_DIR_POLICY_CHOICES}." + ) + raise click.ClickException(msg) + + +def _next_versioned_directory(out_dir): + """Create the next available output directory suffix with versioning""" + idx = 2 + while True: + candidate = out_dir.parent / f"{out_dir.name}_v{idx}" + if not candidate.exists(): + return candidate + idx += 1 diff --git a/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml b/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml index 9843c20a..1cd642b1 100644 --- a/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml +++ b/compass/extraction/geothermal_electricity/geothermal_plugin_config.yaml @@ -164,4 +164,7 @@ heuristic_keywords: - "battery storage" - "energy storage system" - "hydroelectric" - - "biomass" \ No newline at end of file + - "biomass" + - "cannabis" + - "cannabis cultivation" + - "commercial cannabis" \ No newline at end of file diff --git a/compass/extraction/geothermal_electricity/geothermal_schema.json b/compass/extraction/geothermal_electricity/geothermal_schema.json index 46c658ab..2b30883c 100644 --- a/compass/extraction/geothermal_electricity/geothermal_schema.json +++ b/compass/extraction/geothermal_electricity/geothermal_schema.json @@ -1,7 +1,7 @@ { "title": "Geothermal 
Electricity Ordinance Extraction Schema", "description": "Single-shot structured extraction schema for utility-scale geothermal electricity ordinances. This schema guides an LLM to extract all relevant features in one call and returns an outputs array where each object represents one row in the extracted long-form table.", - "version": "2.1.0", + "version": "2.1.2", "type": "object", "required": ["outputs"], "additionalProperties": false, @@ -23,20 +23,21 @@ "properties": { "feature": { "type": "string", - "description": "The ordinance feature being extracted. Must be one of the enumerated feature IDs exactly as written. Do not invent aliases, prefixes, or synonym variants; for example, use 'residential zones setback' and never 'structures residential zones setback'.", + "description": "The ordinance feature being extracted. Must be one of the enumerated feature IDs exactly as written. Do not invent aliases, prefixes, or synonym variants; for example, use 'residential zones distance' and never 'structures residential zones distance'.", "enum": [ - "residential zones setback", - "property lines setback", - "roads setback", - "railroads setback", - "existing transmission lines setback", - "water bodies setback", - "combustible tanks setback", - "domestic wells setback", - "active faults setback", - "schools setback", - "hospitals setback", - "drilling hours", + "residential zones distance", + "property lines distance", + "roads distance", + "railroads distance", + "existing transmission lines distance", + "water bodies distance", + "combustible tanks distance", + "domestic wells distance", + "active faults distance", + "schools distance", + "hospitals distance", + "drilling start time", + "drilling end time", "noise", "maximum height", "minimum lot size", @@ -49,14 +50,14 @@ "special use districts", "accessory use districts", "prohibited use districts", - "permit requirement", + "required permits", "bond requirement", "decommissioning", "prohibitions" ] }, "value": { 
- "description": "The extracted ordinance value. For numerical setbacks and limits, use a number. For permit or district lists, use an array of strings. For time windows, date language, or other categorical outcomes, use a string. Use null only for qualitative features, and only when an enacted, explicit, enforceable ordinance requirement for that feature is present. Null must never be used to indicate absence. If a feature has no enacted, explicit requirement in the ordinance text, omit that feature from outputs.", + "description": "The extracted ordinance value. For numerical distance thresholds and limits, use a number. For permit or district lists, use an array of strings. For drilling start and end time features, use a string in 24-hour HH:MM format. For date language or other categorical outcomes, use a string. Use null only for qualitative features, and only when an enacted, explicit, enforceable ordinance requirement for that feature is present. Null must never be used to indicate absence. If a feature has no enacted, explicit requirement in the ordinance text, omit that feature from outputs.", "anyOf": [ { "type": "number" @@ -77,7 +78,7 @@ }, "units": { "type": ["string", "null"], - "description": "Units for the extracted value. Use canonical units for interoperability. For setbacks and height, use 'feet' or 'meters'. For minimum lot size, use 'acres' or 'square feet'. For noise, normalize A-weighted variants such as 'dB(A)' or 'dBA' to 'dBA'; keep plain 'dB' only when the text is explicitly not A-weighted. Preserve verbatim ordinance wording in summary while keeping units standardized in this field. Use null for district lists, permit lists, dates, and qualitative requirements without measurable units. For drilling-hours strings, use null unless the ordinance states a measurable hour cap rather than an operating window." + "description": "Units for the extracted value. Use canonical units for interoperability. 
For distance and height features, use 'feet' or 'meters'. For minimum lot size, use 'acres' or 'square feet'. For noise, normalize A-weighted variants such as 'dB(A)' or 'dBA' to 'dBA'; keep plain 'dB' only when the text is explicitly not A-weighted. For drilling start and drilling end features, use 'HH:MM (24-hour)'. Preserve verbatim ordinance wording in summary while keeping units standardized in this field. Use null for district lists, permit lists, dates, and qualitative requirements without measurable units." }, "section": { "type": ["string", "null"], @@ -99,20 +100,23 @@ "scope_context": { "description": "Only extract requirements that apply to utility-scale geothermal electricity generation and directly associated geothermal electricity infrastructure, including exploration or drilling operations, production or injection wells, geothermal power plants, associated substations, and gen-tie lines only when the ordinance text explicitly governs them. Exclude geothermal heat pumps, HVAC, direct-use geothermal, district heating, greenhouse heating, residential geothermal, and other non-generation systems unless the ordinance explicitly governs utility-scale electricity generation as well. State statutes and regulations are in scope when they impose enforceable siting, zoning, drilling, permitting, monitoring, bonding, setback, or decommissioning requirements on geothermal electricity projects in the jurisdiction. This schema must work across nationwide ordinance styles, including county, municipal, township, parish, borough, tribal, and state-level regulatory text." }, + "technology_applicability_gate": { + "description": "Extract a row only when the cited evidence clearly applies to geothermal electricity. 
If the excerpt does not explicitly mention geothermal, the summary must include clear evidence that the governing section or table applies to geothermal electricity projects (for example a geothermal-specific chapter heading, geothermal-defined use class, or explicit cross-reference in the same provided text). If applicability is ambiguous or technology-neutral without clear geothermal linkage, omit the feature." + }, "nationwide_jurisdiction_handling": { "description": "Preserve the ordinance's own governance vocabulary instead of normalizing it to one jurisdiction type. Districts may appear as zoning districts, use districts, overlays, resource areas, exclusive farm use zones, planned development areas, or similar land-use categories. Permits may appear as conditional use permits, special use permits, administrative permits, use permits, site certificates, plans of operation, drilling authorizations, county approvals, state board approvals, or similar authorizations. If a jurisdiction is effectively unzoned or the controlling requirements come from state regulation without district tables, omit district features rather than forcing a district classification." }, "strict_evidence_gate": { - "description": "Extract a feature only when the ordinance text explicitly states a requirement, permission, district allowance, prohibition, date, or other operative rule for that same feature. Never infer, assume, extrapolate, or guess from related context, implications, headings, or nearby provisions. Tables, matrices, footnotes, appendices, and labeled exhibits count when they state the operative requirement. If the ordinance points to an outside document or technical standard without restating the controlling requirement in the ordinance text itself, do not import missing values from that outside source." 
+ "description": "Extract a feature only when the ordinance text explicitly states a requirement, permission, district allowance, prohibition, date, or other operative rule for that same feature. Never infer, assume, extrapolate, or guess from related context, implications, headings, or nearby provisions. Tables, matrices, footnotes, appendices, and labeled exhibits count when they state the operative requirement. If the ordinance points to another chapter, outside document, or technical standard without restating the controlling requirement in the provided text, omit the feature instead of emitting a blank, placeholder, or inferred value." }, "data_omission": { "description": "Emit only positively matched features. If a feature is not explicitly present, omit it entirely rather than returning placeholder text. For qualitative features, use value=null and units=null only when an enacted, explicit requirement for that same feature is present. For numeric features, extract only when an explicit numeric threshold is stated in the ordinance text; otherwise omit the feature instead of returning null, empty, or qualitative-only values. Never emit absence placeholders such as 'not found', 'no explicit requirement', 'none', or similar text in any field." }, "numeric_prioritization": { - "description": "When multiple numeric values apply to the same feature, keep one row and select the controlling most restrictive value for that feature. Restrictiveness rules: setbacks choose the largest minimum separation distance; noise choose the lowest allowed numeric limit; maximum height choose the lowest maximum height; minimum lot size choose the highest minimum lot size. Keep condition-specific alternatives in summary only when the ordinance text explicitly shows they all apply to the same geothermal electricity feature." + "description": "When multiple numeric values apply to the same feature, keep one row and select the controlling most restrictive value for that feature. 
Restrictiveness rules: distance features choose the largest minimum separation distance; noise choose the lowest allowed numeric limit; maximum height choose the lowest maximum height; minimum lot size choose the highest minimum lot size. Keep condition-specific alternatives in summary only when the ordinance text explicitly shows they all apply to the same geothermal electricity feature." }, "district_and_permit_routing": { - "description": "Use the district features only for zoning districts, overlay districts, or land-use districts that explicitly classify geothermal electricity facilities. Use 'primary use districts' when the use is allowed by right or as a principal permitted use. Use 'special use districts' when the use requires a conditional use permit, special use permit, or comparable discretionary approval. Use 'accessory use districts' only when the ordinance explicitly allows geothermal electricity as an accessory or secondary use. Use 'prohibited use districts' only for districts where geothermal electricity facilities are explicitly and unconditionally prohibited. Use 'permit requirement' for explicit permit, approval, site-plan, zoning-permit, county-permit, drilling-permit, or similar authorization requirements. Do not route permit requirements into district features, and do not route district tables into permit features." + "description": "Use the district features only for zoning districts, overlay districts, or land-use districts that explicitly classify geothermal electricity facilities. Use 'primary use districts' when the use is allowed by right or as a principal permitted use. Use 'special use districts' when the use requires a conditional use permit, special use permit, or comparable discretionary approval. Use 'accessory use districts' only when the ordinance explicitly allows geothermal electricity as an accessory or secondary use. 
Use 'prohibited use districts' only for districts where geothermal electricity facilities are explicitly and unconditionally prohibited. Use 'required permits' for explicit permit, approval, site-plan, zoning-permit, county-permit, drilling-permit, or similar authorization requirements that must be filed to initiate geothermal exploration, drilling, construction, or operation. Do not route permit requirements into district features, and do not route district tables into permit features." }, "prohibition_boundary": { "description": "Use 'prohibitions' only for currently effective bans or moratoria on geothermal electricity exploration, drilling, well development, power plant siting, facility construction, or related project deployment. A ban on hydraulic fracturing or fracking counts only when the ordinance explicitly uses that ban to prohibit, limit, or condition geothermal electricity development. Do not treat ordinary permit conditions, environmental standards, or operational restrictions as prohibitions when the project remains allowable subject to compliance." @@ -122,37 +126,37 @@ "setback_features": { "description": "Setback features for geothermal electricity facilities and related infrastructure. Treat each setback feature independently and do not cross-apply a setback unless the ordinance text explicitly states that it applies to multiple target types. When a single clause explicitly lists multiple target types and one shared numeric setback, emit one row per explicitly listed feature using the same numeric value and units and cite the same clause in summary. Apply the shared numeric prioritization rules in $core_principles when multiple numeric values explicitly apply to the same feature.", "properties": { - "residential zones setback": { + "residential zones distance": { "description": "Minimum required separation from structures, dwellings, occupied buildings, residences, homes, residential receptors, residential uses, or residential zoning districts. 
Extract this feature only when the ordinance explicitly ties the setback to structures, residences, occupied buildings, or residential zones. If one clause applies a common setback to multiple structure-like receptors such as homes, occupied buildings, and residential districts, keep one row under this feature and preserve the exact receptor list in summary. Do not map generic property line or district boundary setbacks here unless the text explicitly names structures or residential zones." }, - "property lines setback": { - "description": "Minimum required separation from property lines, lot lines, parcel boundaries, or lease boundaries when the ordinance explicitly states the setback is measured from that boundary. Do not remap property-line setbacks to roads, transmission lines, or residential zones unless the text explicitly makes them equivalent for that requirement." + "property lines distance": { + "description": "Minimum required separation from property lines, lot lines, parcel boundaries, or lease boundaries when the ordinance explicitly states the distance is measured from that boundary. Do not remap property-line distances to roads, transmission lines, or residential zones unless the text explicitly makes them equivalent for that requirement. Distances to official plan lines or specific plan lines for public highways do not belong here unless the ordinance expressly defines those lines as property boundaries for the same requirement." }, - "roads setback": { - "description": "Minimum required separation from public roads, road rights-of-way, streets, highways, named roadway corridors, or similar transportation corridors explicitly framed as roads. Property-line setbacks do not count for this feature unless the ordinance text explicitly states that the property line is the road right-of-way or otherwise makes them the same boundary for that requirement." 
+ "roads distance": { + "description": "Minimum required separation from public roads, road rights-of-way, streets, highways, named roadway corridors, official plan lines, specific plan lines for highways, or similar transportation corridors explicitly framed as roads. Property-line distances do not count for this feature unless the ordinance text explicitly states that the property line is the road right-of-way or otherwise makes them the same boundary for that requirement." }, - "railroads setback": { + "railroads distance": { "description": "Minimum required separation from railroads, railroad rights-of-way, rail corridors, or active rail lines. Extract only when rail infrastructure is explicitly named." }, - "existing transmission lines setback": { + "existing transmission lines distance": { "description": "Minimum required separation from existing transmission lines, transmission corridors, substations, or other existing electric transmission infrastructure when explicitly named. Use this feature for geothermal generation or gen-tie line setbacks from existing transmission assets only when the ordinance text expressly states the setback." }, - "water bodies setback": { + "water bodies distance": { "description": "Minimum required separation from rivers, streams, lakes, ponds, wetlands, reservoirs, shorelines, floodplains, springs, or other water bodies explicitly named in the ordinance. Do not map domestic water well setbacks here." }, - "combustible tanks setback": { + "combustible tanks distance": { "description": "Minimum required separation from combustible tanks, fuel tanks, flammable storage tanks, petroleum tanks, or similar combustible storage infrastructure. Extract only when that storage infrastructure is explicitly named." 
}, - "domestic wells setback": { + "domestic wells distance": { "description": "Minimum required separation from domestic wells, private wells, drinking water wells, household wells, or other non-production water supply wells explicitly identified as domestic or private. Do not map production, injection, monitoring, or geothermal wells here." }, - "active faults setback": { + "active faults distance": { "description": "Minimum required separation from active faults, known faults, fault traces, seismic hazard zones, or similar geologic fault features when explicitly named as a setback or exclusion distance." }, - "schools setback": { + "schools distance": { "description": "Minimum required separation from schools, school properties, school buildings, educational campuses, or similar school uses explicitly named in the ordinance." }, - "hospitals setback": { + "hospitals distance": { "description": "Minimum required separation from hospitals, medical centers, clinics, nursing facilities, or similar health-care institutions explicitly named in the ordinance." } } @@ -174,8 +178,11 @@ "time_window_features": { "description": "Time-of-day or day-of-week operational schedule restrictions.", "properties": { - "drilling hours": { - "description": "Extract explicit drilling-hours requirements, including allowed or prohibited hours of operation, days of week, holiday restrictions, and emergency exceptions. Use a string value when the ordinance states a time window or schedule, and use units=null. If the ordinance instead states a numeric cap on hours per day or week, you may use a numeric value with units such as 'hours per day' or 'hours per week'. Do not infer drilling-hours limits from general noise rules unless the ordinance explicitly sets drilling-hour restrictions." + "drilling start time": { + "description": "Extract the earliest allowed geothermal drilling start time in 24-hour HH:MM format from explicit ordinance drilling-hour requirements. 
Normalize times like 7 a.m. to 07:00. When the ordinance states prohibited windows, convert to the corresponding allowed start time if the allowed window is explicit. If the ordinance contains both a broad 24-hour allowance and a narrower recurring non-emergency window for drilling or drilling-related activities (for example site preparation, drill-pipe handling, well workover, or similar drilling-stage tasks), use the narrower recurring window as the controlling schedule and describe the 24-hour exception in summary. Use units 'HH:MM (24-hour)'. Do not infer drilling schedule limits from general noise rules unless the ordinance explicitly sets drilling-hour restrictions." + }, + "drilling end time": { + "description": "Extract the latest allowed geothermal drilling end time in 24-hour HH:MM format from explicit ordinance drilling-hour requirements. Normalize times like 7 p.m. to 19:00. When the ordinance allows 24-hour drilling, use 24:00 as the end time and 00:00 as the start time only if no narrower recurring non-emergency drilling window is also stated. If both are present, use the narrower recurring window and capture the 24-hour exception in summary. Use units 'HH:MM (24-hour)'. Do not infer drilling schedule limits from general noise rules unless the ordinance explicitly sets drilling-hour restrictions." } } }, @@ -197,8 +204,8 @@ "seismic monitoring plan": { "description": "Extract explicit requirements for a seismic monitoring plan, induced seismicity monitoring plan, geophysical monitoring plan, or similar seismicity-management document or program. Do not use this feature for generic environmental monitoring unless the ordinance explicitly ties it to seismic or fault-related monitoring." }, - "permit requirement": { - "description": "Extract explicit permit, approval, authorization, review, certification, or comparable entitlement requirements for geothermal electricity facilities. 
This includes conditional use permits, special use permits, zoning permits, drilling permits, county permits, municipal permits, use permits, site plan approvals, plans of operation, state board approvals, siting certificates, and similar authorizations. Preserve the ordinance's exact permit or approval names in value. Use an array of strings when multiple permit types are explicitly required." + "required permits": { + "description": "Extract explicit permits, approvals, authorizations, reviews, certifications, or comparable entitlements that must be filed or obtained to initiate geothermal exploration, drilling, facility construction, or facility operation. This includes conditional use permits, special use permits, zoning permits, drilling permits, county permits, municipal permits, use permits, site plan approvals, plans of operation, state board approvals, siting certificates, and similar authorizations. Preserve exact permit names in value. Always output value as an array of strings, even when only one permit is required. Exclude purely procedural pathways and non-filing actions such as appeals, interpretations, lot line adjustments, variances not required for geothermal initiation, temporary-event permits, or unrelated permits unless the ordinance explicitly makes them mandatory for geothermal project initiation." }, "bond requirement": { "description": "Extract explicit bonding, financial assurance, surety, letter-of-credit, security, escrow, reclamation guarantee, decommissioning guarantee, or similar assurance requirements. Use value=null and units=null unless the ordinance states a specific numeric amount or formula that should be preserved in summary. If the ordinance uses a formula, engineer estimate, inflation adjustment, or agency-determined amount instead of a fixed number, keep that logic in summary rather than forcing a numeric value." 
@@ -212,10 +219,10 @@ "description": "Zoning, overlay, and land-use district allowances for geothermal electricity facilities.", "properties": { "primary use districts": { - "description": "Extract all districts, zones, overlays, resource areas, or similar land-use categories where geothermal electricity facilities are explicitly allowed by right, as a principal permitted use, or under an overlay that functions as a primary-use authorization. Use an array of district names in value. Preserve the exact district names or codes from the ordinance text and describe the allowance in summary." + "description": "Extract all districts, zones, overlays, resource areas, or similar land-use categories where geothermal electricity facilities are explicitly allowed by right, as a principal permitted use, or under an overlay that functions as a primary-use authorization. Use an array of district names in value. Preserve exact district names or codes and describe by-right allowance in summary. If the ordinance classifies geothermal as conditional, special, discretionary, exception, or permit-only, do not use this feature; use 'special use districts' instead." }, "special use districts": { - "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are allowed only through special use, conditional use, special exception, discretionary review, or comparable approval. Use an array of district names in value and preserve the approval posture in summary. Do not route the permit type itself here unless it is part of the district allowance language." + "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are allowed only through special use, conditional use, special exception, discretionary review, or comparable approval. Use an array of district names in value and preserve the approval posture in summary. 
Do not route the permit type itself here unless it is part of the district allowance language. If the ordinance states geothermal is conditional in a district, never classify that district as primary use." }, "accessory use districts": { "description": "Extract all districts, zones, overlays, or similar land-use categories where geothermal electricity facilities are explicitly allowed only as an accessory, incidental, or subordinate use to another principal use. Use an array of district names in value and preserve the accessory-use condition in summary." @@ -238,12 +245,12 @@ { "outputs": [ { - "feature": "property lines setback", + "feature": "property lines distance", "value": 500, "units": "feet", "section": "Section 8.4 - Setbacks", "summary": "'Geothermal production wells and associated facilities shall be set back at least 500 feet from all property lines.'", - "explanation": "The excerpt states an explicit numeric minimum separation from property lines, so it maps directly to 'property lines setback' with value 500 feet." + "explanation": "The excerpt states an explicit numeric minimum separation from property lines, so it maps directly to 'property lines distance' with value 500 feet." }, { "feature": "special use districts", @@ -257,7 +264,7 @@ "explanation": "The ordinance explicitly lists two districts where geothermal power plants are allowed only through conditional-use approval, so this belongs under 'special use districts' with the district names preserved as an array." 
}, { - "feature": "permit requirement", + "feature": "required permits", "value": [ "conditional use permit", "county drilling permit" @@ -265,15 +272,23 @@ "units": null, "section": "Section 5.2 - Approval Process", "summary": "'A conditional use permit and county drilling permit shall be obtained prior to the construction or operation of any geothermal power plant or exploratory well.'", - "explanation": "The clause explicitly requires two project approvals for geothermal electricity development, so it belongs under 'permit requirement' with both permit names preserved as an array." + "explanation": "The clause explicitly requires two project approvals before geothermal development can start, so it belongs under 'required permits' with both permit names preserved as an array." }, { - "feature": "drilling hours", - "value": "7:00 a.m. to 7:00 p.m. Monday through Saturday, excluding holidays", - "units": null, + "feature": "drilling start time", + "value": "07:00", + "units": "HH:MM (24-hour)", + "section": "Section 6.7 - Drilling Operations", + "summary": "'Routine geothermal drilling activities may occur only between 7:00 a.m. and 7:00 p.m., Monday through Saturday, and shall not occur on Sundays or legal holidays except in an emergency.'", + "explanation": "The ordinance gives an explicit drilling window beginning at 7:00 a.m., normalized to 24-hour time as 07:00." + }, + { + "feature": "drilling end time", + "value": "19:00", + "units": "HH:MM (24-hour)", "section": "Section 6.7 - Drilling Operations", "summary": "'Routine geothermal drilling activities may occur only between 7:00 a.m. and 7:00 p.m., Monday through Saturday, and shall not occur on Sundays or legal holidays except in an emergency.'", - "explanation": "The ordinance states an explicit operating window for geothermal drilling, so this belongs under 'drilling hours' as a schedule string rather than a numeric cap." 
+ "explanation": "The ordinance gives an explicit drilling window ending at 7:00 p.m., normalized to 24-hour time as 19:00." }, { "feature": "bond requirement", @@ -300,7 +315,9 @@ "Tables, table footnotes, and labeled graphics count as valid ordinance evidence when they state the controlling requirement; preserve the relevant table cell or footnote context in summary.", "Preserve exact local or state regulatory terminology in summary and, where applicable, in value. Do not rename district categories, permit names, or agency approvals into a preferred local template.", "If ordinance text shows an amended or superseding requirement, extract the current operative requirement as written rather than a superseded historical value unless the ordinance text clearly keeps both rules active.", - "If text is suggestive but not explicit for the target feature, omit the feature." + "If text is suggestive but not explicit for the target feature, omit the feature.", + "If the text references a different chapter or external document for the controlling value but does not restate that value here, omit the feature instead of outputting blanks or placeholders.", + "If a provision is written for renewable or energy facilities generally, extract it only when the same provided evidence clearly ties that provision to geothermal electricity." ], "setbacks": [ "Setbacks should be extracted as minimum separation distances.", @@ -308,17 +325,19 @@ "Setback rows must contain numeric value and non-null units; never emit qualitative-only setback rows.", "If both general and condition-specific setbacks are provided, select the controlling most restrictive value for the geothermal electricity scenario and describe conditions in summary.", "Do not infer one setback feature from another. 
A property-line setback is not a structures setback, and a roads setback is not a railroad setback, unless the ordinance text explicitly says so.", - "When one setback clause explicitly names multiple target features and provides one shared numeric threshold, emit one row per explicitly named feature using the same threshold and units." + "When one setback clause explicitly names multiple target features and provides one shared numeric threshold, emit one row per explicitly named feature using the same threshold and units.", + "Treat distances to official plan lines or specific plan lines for public highways as roads distance unless the ordinance explicitly defines them as property boundaries for the same requirement." ], "numerical": [ - "Numerical features in this schema are the eleven setback features plus noise, maximum height, and minimum lot size.", + "Numerical features in this schema are the eleven distance features plus noise, maximum height, and minimum lot size.", "For noise, maximum height, and minimum lot size, extract only explicit numeric thresholds. If the ordinance gives only narrative standards or references other codes without restating the threshold, omit the feature.", - "For drilling-hours requirements expressed as time windows or allowed days, use a string value and units=null. Use a numeric value only when the ordinance states a clear numeric cap such as hours per day." + "For drilling schedule requirements, extract explicit start and end times into 'drilling start time' and 'drilling end time' using 24-hour HH:MM format and units 'HH:MM (24-hour)'." 
], "qualitative": [ "For qualitative features, output only when an explicit enforceable requirement is present.", "For fencing, color requirements, lighting requirements, visual impact assessment, seismic monitoring plan, bond requirement, and decommissioning, prefer value=null and units=null unless the ordinance states a specific numeric threshold or an explicit list that should be preserved in value.", - "For permit requirement, use a string for one permit type and an array of strings when multiple permit or approval types are explicitly required.", + "For required permits, always use an array of strings in value, even when only one permit is required.", + "For drilling start and end times, if both a broad 24-hour allowance and a narrower recurring non-emergency drilling window are present, extract the narrower recurring window and mention the 24-hour exception in summary.", "For bond requirement, preserve formulas, engineer estimates, inflation adjustments, agency-set amounts, and similar non-fixed sizing logic in summary instead of forcing a numeric amount.", "Do not map generic application materials, narrative findings, or descriptive recitals into these features unless the ordinance explicitly makes them enforceable requirements." 
], @@ -343,6 +362,10 @@ "Every row must have non-null, non-empty strings for summary and explanation.", "Explanation must explicitly tie summary evidence to the selected feature and must not contradict feature inclusion or exclusion criteria.", "For every numeric feature row, require numeric value and non-null units.", + "For 'required permits', require value to be a non-empty array of strings and units to be null.", + "For drilling schedule rows, require both 'drilling start time' and 'drilling end time' when the ordinance states an explicit allowed window.", + "For drilling schedule rows with both 24-hour and narrower recurring windows, keep the narrower recurring window values and retain 24-hour language in summary only as an exception.", + "Reject any district row where summary language indicates conditional, special, discretionary, or permit-only approval but feature is 'primary use districts'.", "Remove any numeric-feature row derived only from qualitative language when no numeric threshold is quoted.", "If summary or explanation indicates the feature is not applicable, omit the row.", "If a feature fails any check, omit it rather than returning a partial row." 
diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py index 1887de03..ea5956a7 100644 --- a/compass/plugin/one_shot/components.py +++ b/compass/plugin/one_shot/components.py @@ -1,8 +1,10 @@ """COMPASS extraction schema-based plugin component implementations""" import json +import ast import asyncio import logging +import re from datetime import datetime from abc import ABC, abstractmethod @@ -389,6 +391,8 @@ async def parse(self, text): def _to_dataframe(self, data): """Convert LLM output to a DataFrame""" + data = self._normalize_outputs(data) + output_items = self.SCHEMA["properties"]["outputs"]["items"] all_features = output_items["properties"]["feature"]["enum"] @@ -413,3 +417,115 @@ def _to_dataframe(self, data): ] out_cols = [col for col in possible_out_cols if col in full_df.columns] return full_df[["feature", *out_cols, "quantitative"]] + + def _normalize_outputs(self, data): + """Normalize selected feature payloads for stable CSV outputs + + Postprocessing is schema-driven and optional. To enable it, + provide ``$postprocess_rules.pipeline`` in the schema. 
+ """ + + rules = self.SCHEMA.get("$postprocess_rules") or {} + pipeline = rules.get("pipeline") or [] + if not pipeline: + return data + + norm = [] + for row in data: + if not isinstance(row, dict): + continue + + out = row + for step in pipeline: + out = self._apply_postprocess_step(out, step) + if out is None: + break + + if out is not None: + norm.append(out) + + return norm + + + def _apply_postprocess_step(self, row, step): + """Apply one schema-configured postprocessing step to a row""" + operation = (step.get("operation") or "").casefold() + if not operation: + return row + if operation == "bounded_time_from_summary": + return self._pp_bounded_time_from_summary(row, step) + logger.debug("Unknown postprocess operation: %r", operation) + return row + + + # _pp_force_array_value removed: array enforcement is handled by schema and prompt only + + def _pp_bounded_time_from_summary(self, row, step): + """Prefer bounded time windows from summary over fallback values""" + + feature = (row.get("feature") or "").casefold() + source_field = step.get("source_field", "summary") + source_text = row.get(source_field) or "" + time_values = self._extract_times_from_text(source_text) + + for pair in step.get("feature_pairs") or []: + start_feature = ( + pair.get("start_feature", "").casefold() + ) + end_feature = pair.get("end_feature", "").casefold() + if feature not in {start_feature, end_feature}: + continue + + if "units" in pair: + row["units"] = pair.get("units") + + fallback_values = { + str(v) for v in pair.get("fallback_values", ["00:00", "24:00"]) + } + if len(time_values) < 2 or str(row.get("value")) not in fallback_values: + return row + + if feature == start_feature: + row["value"] = min(time_values) + elif feature == end_feature: + row["value"] = max(time_values) + return row + + return row + + + # _normalize_string_list_value removed: array enforcement is handled by schema and prompt only + + @staticmethod + def _extract_times_from_text(text): + """Extract 
times from text as normalized 24-hour HH:MM strings""" + + if not text: + return [] + + ampm_pattern = re.compile( + r"(? 12 or minute < 0 or minute > 59: + continue + + if ampm == "am": + hour = 0 if hour == 12 else hour + else: + hour = 12 if hour == 12 else hour + 12 + + out.append(f"{hour:02d}:{minute:02d}") + + for match in hhmm_pattern.finditer(text): + out.append(f"{int(match.group(1)):02d}:{int(match.group(2)):02d}") + + return sorted(set(out)) diff --git a/tests/python/unit/scripts/test_cli_process.py b/tests/python/unit/scripts/test_cli_process.py new file mode 100644 index 00000000..a1c1cb05 --- /dev/null +++ b/tests/python/unit/scripts/test_cli_process.py @@ -0,0 +1,181 @@ +"""Tests for compass._cli.process""" + +from pathlib import Path + +import pytest + +from click import ClickException + +import compass._cli.process as process_module +from compass._cli.process import ( + _next_versioned_directory, + _resolve_out_dir_conflict, +) + + +def test_next_versioned_directory_skips_existing_versions(tmp_path): + """Find the next available versioned output directory""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + (tmp_path / "outputs_v2").mkdir() + + result = _next_versioned_directory(out_dir) + + assert result == tmp_path / "outputs_v3" + + +def test_resolve_out_dir_conflict_increment(tmp_path): + """Increment output directory when policy is increment""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + result = _resolve_out_dir_conflict(out_dir, "increment") + + assert result == tmp_path / "outputs_v2" + + +def test_resolve_out_dir_conflict_overwrite(tmp_path): + """Remove existing directory when policy is overwrite""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + (out_dir / "temp.txt").write_text("x", encoding="utf-8") + + result = _resolve_out_dir_conflict(out_dir, "overwrite") + + assert result == out_dir + assert not out_dir.exists() + + +def test_resolve_out_dir_conflict_prompt_increment(tmp_path, monkeypatch): + """Prompt mode can 
select incremented directory""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + monkeypatch.setattr(process_module.click, "confirm", lambda *_, **__: True) + + result = _resolve_out_dir_conflict(out_dir, "prompt") + + assert result == tmp_path / "outputs_v2" + + +def test_resolve_out_dir_conflict_prompt_overwrite(tmp_path, monkeypatch): + """Prompt mode can select overwrite directory""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + (out_dir / "temp.txt").write_text("x", encoding="utf-8") + + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + answers = iter([False, True]) + monkeypatch.setattr( + process_module.click, + "confirm", + lambda *_, **__: next(answers), + ) + + result = _resolve_out_dir_conflict(out_dir, "prompt") + + assert result == out_dir + assert not out_dir.exists() + + +def test_resolve_out_dir_conflict_prompt_cancel(tmp_path, monkeypatch): + """Prompt mode raises if user declines both options""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + answers = iter([False, False]) + monkeypatch.setattr( + process_module.click, + "confirm", + lambda *_, **__: next(answers), + ) + + with pytest.raises(ClickException, match="Run cancelled"): + _ = _resolve_out_dir_conflict(out_dir, "prompt") + + +class _NoTty: + def isatty(self): + return False + + +class _Tty: + def isatty(self): + return True + + +def test_resolve_out_dir_conflict_prompt_non_interactive( + tmp_path, monkeypatch +): + """Prompt mode raises in non-interactive mode""" + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + monkeypatch.setattr(process_module.sys, "stdin", _NoTty()) + + with pytest.raises(ClickException, match="non-interactive"): + _ = _resolve_out_dir_conflict(out_dir, "prompt") + + +def test_resolve_out_dir_conflict_fail_keeps_path(tmp_path): + """Fail policy leaves existing output directory unchanged""" + out_dir = tmp_path / 
"outputs" + out_dir.mkdir() + + result = _resolve_out_dir_conflict(out_dir, "fail") + + assert result == out_dir + assert out_dir.exists() + + +def test_process_uses_prompt_policy_in_interactive_terminal( + tmp_path, monkeypatch +): + """Auto-select prompt policy when stdin is a TTY""" + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + confirmed = [] + monkeypatch.setattr( + process_module.click, + "confirm", + lambda *_, **__: confirmed.append(True) or True, + ) + + result = process_module._resolve_out_dir_conflict.__wrapped__ \ + if hasattr(process_module._resolve_out_dir_conflict, "__wrapped__") \ + else None + + policy = "prompt" if process_module.sys.stdin.isatty() else "fail" + assert policy == "prompt" + + +def test_process_uses_fail_policy_in_non_interactive_terminal(monkeypatch): + """Auto-select fail policy when stdin is not a TTY""" + monkeypatch.setattr(process_module.sys, "stdin", _NoTty()) + + policy = "prompt" if process_module.sys.stdin.isatty() else "fail" + assert policy == "fail" + + +def test_process_flag_overrides_tty_detection(tmp_path, monkeypatch): + """Explicit --out_dir_exists flag overrides auto-TTY detection""" + monkeypatch.setattr(process_module.sys, "stdin", _Tty()) + + out_dir = tmp_path / "outputs" + out_dir.mkdir() + + explicit_flag = "increment" + policy = explicit_flag if explicit_flag else ( + "prompt" if process_module.sys.stdin.isatty() else "fail" + ) + result = _resolve_out_dir_conflict(out_dir, policy) + assert result == tmp_path / "outputs_v2" + + +if __name__ == "__main__": + pytest.main(["-q", "--show-capture=all", Path(__file__), "-rapP"]) From 87e72d33a35f4ece719a47bd4e5c78dbe7b30bc4 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:31:18 -0600 Subject: [PATCH 15/21] fix ruff error in components.py --- compass/plugin/one_shot/components.py | 14 +++++--------- 1 file changed, 5 
insertions(+), 9 deletions(-) diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py index ea5956a7..72be7f7d 100644 --- a/compass/plugin/one_shot/components.py +++ b/compass/plugin/one_shot/components.py @@ -1,7 +1,7 @@ """COMPASS extraction schema-based plugin component implementations""" import json -import ast + # ast import removed (unused) import asyncio import logging import re @@ -216,10 +216,7 @@ def _store_chunk(self, parser, chunk_ind): ind_to_grab = chunk_ind + offset if ind_to_grab < 0 or ind_to_grab >= len(parser.text_chunks): continue - - self._chunks.setdefault( - ind_to_grab, parser.text_chunks[ind_to_grab] - ) + self._chunks.setdefault(ind_to_grab, parser.text_chunks[ind_to_grab]) class SchemaBasedTextExtractor(SchemaOutputLLMCaller, BaseTextExtractor): @@ -297,6 +294,7 @@ async def _process(self, text_chunks): ) return text_summary +MAGIC_NEIGHBOR_CHUNK_COUNT = 2 # PLR2004: Magic value used in comparison class SchemaOrdinanceParser(SchemaOutputLLMCaller, BaseParser): """Base class for parsing structured data""" @@ -431,19 +429,17 @@ def _normalize_outputs(self, data): return data norm = [] + norm_extend = norm.extend for row in data: if not isinstance(row, dict): continue - out = row for step in pipeline: out = self._apply_postprocess_step(out, step) if out is None: break - if out is not None: - norm.append(out) - + norm_extend([out]) return norm From eaaab545dfdf6c831484fd49570f62e1cdfc3047 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Tue, 31 Mar 2026 09:57:50 -0600 Subject: [PATCH 16/21] ruff check on components.py --- compass/plugin/one_shot/components.py | 89 ++++++++++++++------------- 1 file changed, 47 insertions(+), 42 deletions(-) diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py index 72be7f7d..3a831de9 100644 --- a/compass/plugin/one_shot/components.py +++ b/compass/plugin/one_shot/components.py @@ -1,7 
+1,6 @@ """COMPASS extraction schema-based plugin component implementations""" import json - # ast import removed (unused) import asyncio import logging import re @@ -214,9 +213,14 @@ def _store_chunk(self, parser, chunk_ind): """Store chunk and its neighbors if it is not already stored""" for offset in range(1 - parser.num_to_recall, 2): ind_to_grab = chunk_ind + offset - if ind_to_grab < 0 or ind_to_grab >= len(parser.text_chunks): + if ( + ind_to_grab < 0 + or ind_to_grab >= len(parser.text_chunks) + ): continue - self._chunks.setdefault(ind_to_grab, parser.text_chunks[ind_to_grab]) + self._chunks.setdefault( + ind_to_grab, parser.text_chunks[ind_to_grab] + ) class SchemaBasedTextExtractor(SchemaOutputLLMCaller, BaseTextExtractor): @@ -294,13 +298,19 @@ async def _process(self, text_chunks): ) return text_summary -MAGIC_NEIGHBOR_CHUNK_COUNT = 2 # PLR2004: Magic value used in comparison + +# Constants for magic values +_MAGIC_NEIGHBOR_CHUNK_COUNT = 2 +_MAGIC_HOUR_12 = 12 +_MAGIC_MINUTE_59 = 59 + class SchemaOrdinanceParser(SchemaOutputLLMCaller, BaseParser): """Base class for parsing structured data""" DATA_TYPE_SHORT_DESC = None - """Optional short description of the type of data being extracted + """ + Optional short description of the type of data being extracted Examples -------- @@ -417,11 +427,7 @@ def _to_dataframe(self, data): return full_df[["feature", *out_cols, "quantitative"]] def _normalize_outputs(self, data): - """Normalize selected feature payloads for stable CSV outputs - - Postprocessing is schema-driven and optional. To enable it, - provide ``$postprocess_rules.pipeline`` in the schema. 
- """ + """Normalize selected feature payloads for stable CSV outputs""" rules = self.SCHEMA.get("$postprocess_rules") or {} pipeline = rules.get("pipeline") or [] @@ -442,7 +448,6 @@ def _normalize_outputs(self, data): norm_extend([out]) return norm - def _apply_postprocess_step(self, row, step): """Apply one schema-configured postprocessing step to a row""" operation = (step.get("operation") or "").casefold() @@ -453,75 +458,75 @@ def _apply_postprocess_step(self, row, step): logger.debug("Unknown postprocess operation: %r", operation) return row - - # _pp_force_array_value removed: array enforcement is handled by schema and prompt only - def _pp_bounded_time_from_summary(self, row, step): - """Prefer bounded time windows from summary over fallback values""" - + """ + Prefer bounded time windows from summary over fallback + values + """ feature = (row.get("feature") or "").casefold() source_field = step.get("source_field", "summary") source_text = row.get(source_field) or "" time_values = self._extract_times_from_text(source_text) - for pair in step.get("feature_pairs") or []: - start_feature = ( - pair.get("start_feature", "").casefold() - ) + start_feature = pair.get("start_feature", "").casefold() end_feature = pair.get("end_feature", "").casefold() if feature not in {start_feature, end_feature}: continue - if "units" in pair: row["units"] = pair.get("units") - fallback_values = { str(v) for v in pair.get("fallback_values", ["00:00", "24:00"]) } - if len(time_values) < 2 or str(row.get("value")) not in fallback_values: + if ( + len(time_values) < _MAGIC_NEIGHBOR_CHUNK_COUNT + or str(row.get("value")) not in fallback_values + ): return row - if feature == start_feature: row["value"] = min(time_values) elif feature == end_feature: row["value"] = max(time_values) return row - return row - - # _normalize_string_list_value removed: array enforcement is handled by schema and prompt only - @staticmethod def _extract_times_from_text(text): - """Extract times from text 
as normalized 24-hour HH:MM strings""" - + """ + Extract times from text as normalized 24-hour HH:MM + strings + """ if not text: return [] - ampm_pattern = re.compile( r"(? 12 or minute < 0 or minute > 59: + if ( + hour < 1 + or hour > _MAGIC_HOUR_12 + or minute < 0 + or minute > _MAGIC_MINUTE_59 + ): continue - if ampm == "am": - hour = 0 if hour == 12 else hour + hour = 0 if hour == _MAGIC_HOUR_12 else hour else: - hour = 12 if hour == 12 else hour + 12 - + hour = ( + _MAGIC_HOUR_12 + if hour == _MAGIC_HOUR_12 + else hour + _MAGIC_HOUR_12 + ) out.append(f"{hour:02d}:{minute:02d}") - - for match in hhmm_pattern.finditer(text): - out.append(f"{int(match.group(1)):02d}:{int(match.group(2)):02d}") - + out.extend( + [ + f"{int(match.group(1)):02d}:{int(match.group(2)):02d}" + for match in hhmm_pattern.finditer(text) + ] + ) return sorted(set(out)) From ca3aaac93d8dab5503036be7f2a9bb749bfef0d3 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:06:15 -0600 Subject: [PATCH 17/21] fix ruff errors in process.py --- compass/_cli/process.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/compass/_cli/process.py b/compass/_cli/process.py index 89a71112..5a062c7f 100644 --- a/compass/_cli/process.py +++ b/compass/_cli/process.py @@ -75,7 +75,9 @@ def process(config, verbose, no_progress, plugin, out_dir_exists): out_dir_policy = "prompt" else: out_dir_policy = "fail" - config["out_dir"] = _resolve_out_dir_conflict(config["out_dir"], out_dir_policy) + config["out_dir"] = _resolve_out_dir_conflict( + config["out_dir"], out_dir_policy + ) if plugin is not None: create_schema_based_one_shot_extraction_plugin( @@ -219,7 +221,10 @@ def _resolve_out_dir_conflict(out_dir, policy): def _next_versioned_directory(out_dir): - """Create the next available output directory suffix with versioning""" + """ + Create the next available output directory suffix with + versioning + """ idx = 2 while 
True: candidate = out_dir.parent / f"{out_dir.name}_v{idx}" From 7ea26279aebe19b5756b88b151069fc57891bc8b Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Tue, 31 Mar 2026 10:15:02 -0600 Subject: [PATCH 18/21] ruff check on updated files --- compass/plugin/one_shot/components.py | 5 +---- tests/python/unit/scripts/test_cli_process.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/compass/plugin/one_shot/components.py b/compass/plugin/one_shot/components.py index 3a831de9..6141b153 100644 --- a/compass/plugin/one_shot/components.py +++ b/compass/plugin/one_shot/components.py @@ -213,10 +213,7 @@ def _store_chunk(self, parser, chunk_ind): """Store chunk and its neighbors if it is not already stored""" for offset in range(1 - parser.num_to_recall, 2): ind_to_grab = chunk_ind + offset - if ( - ind_to_grab < 0 - or ind_to_grab >= len(parser.text_chunks) - ): + if ind_to_grab < 0 or ind_to_grab >= len(parser.text_chunks): continue self._chunks.setdefault( ind_to_grab, parser.text_chunks[ind_to_grab] diff --git a/tests/python/unit/scripts/test_cli_process.py b/tests/python/unit/scripts/test_cli_process.py index a1c1cb05..a25a7fd6 100644 --- a/tests/python/unit/scripts/test_cli_process.py +++ b/tests/python/unit/scripts/test_cli_process.py @@ -146,9 +146,11 @@ def test_process_uses_prompt_policy_in_interactive_terminal( lambda *_, **__: confirmed.append(True) or True, ) - result = process_module._resolve_out_dir_conflict.__wrapped__ \ - if hasattr(process_module._resolve_out_dir_conflict, "__wrapped__") \ + result = ( + process_module._resolve_out_dir_conflict.__wrapped__ + if hasattr(process_module._resolve_out_dir_conflict, "__wrapped__") else None + ) policy = "prompt" if process_module.sys.stdin.isatty() else "fail" assert policy == "prompt" @@ -170,8 +172,10 @@ def test_process_flag_overrides_tty_detection(tmp_path, monkeypatch): out_dir.mkdir() explicit_flag = "increment" - policy = 
explicit_flag if explicit_flag else ( - "prompt" if process_module.sys.stdin.isatty() else "fail" + policy = ( + explicit_flag + if explicit_flag + else ("prompt" if process_module.sys.stdin.isatty() else "fail") ) result = _resolve_out_dir_conflict(out_dir, policy) assert result == tmp_path / "outputs_v2" From 197a98cd241dd5eabf3b5e9bd3785cee7134d4de Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Mar 2026 11:00:30 -0600 Subject: [PATCH 19/21] Bump codecov/codecov-action from 5 to 6 (#403) Bumps [codecov/codecov-action](https://github.com/codecov/codecov-action) from 5 to 6. - [Release notes](https://github.com/codecov/codecov-action/releases) - [Changelog](https://github.com/codecov/codecov-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/codecov/codecov-action/compare/v5...v6) --- updated-dependencies: - dependency-name: codecov/codecov-action dependency-version: '6' dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/codecov.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 02f05bfd..bb5d042f 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -44,7 +44,7 @@ jobs: pixi run -e pdev --locked tests-p - name: Upload coverage to Codecov - uses: codecov/codecov-action@v5 + uses: codecov/codecov-action@v6 with: token: ${{ secrets.CODECOV_TOKEN }} files: ./coverage.xml From a9c8929d135549515119a7548319d5d80c6093d6 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 31 Mar 2026 11:12:54 -0600 Subject: [PATCH 20/21] Update tox file --- tox.ini | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index b6e74117..10e36a0f 100644 --- a/tox.ini +++ b/tox.ini @@ -44,4 +44,6 @@ deps= pytest>=8.0 [testenv:latest] -description = no constraints, thus latest version of dependencies +description = (almost) no constraints, thus latest version of dependencies +deps= + playwright~=1.49.0 # playwright has to be pinned From 314af432746ec2472fe0ac42cc5759d228620461 Mon Sep 17 00:00:00 2001 From: ppinchuk Date: Tue, 31 Mar 2026 11:46:12 -0600 Subject: [PATCH 21/21] Update tox file --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 10e36a0f..f1b3df6d 100644 --- a/tox.ini +++ b/tox.ini @@ -46,4 +46,4 @@ deps= [testenv:latest] description = (almost) no constraints, thus latest version of dependencies deps= - playwright~=1.49.0 # playwright has to be pinned + crawl4ai<=0.72.0