diff --git a/src/pyosmeta/models/base.py b/src/pyosmeta/models/base.py index 1458e38..c93983e 100644 --- a/src/pyosmeta/models/base.py +++ b/src/pyosmeta/models/base.py @@ -324,6 +324,11 @@ class ReviewModel(BaseModel): gh_meta: Optional[GhMeta] = None labels: list[str] = Field(default_factory=list) active: bool = True # To indicate if package is maintained or archived + # Generative AI disclosure (from "Development Best Practices & GenerativeAI + # Use Disclosure" section; None when section is absent) + genai_used: Optional[bool] = None + genai_tools: Optional[str] = None + genai_scope: Optional[str] = None @model_validator(mode="after") def set_repository_host_from_link(self): diff --git a/src/pyosmeta/parse_issues.py b/src/pyosmeta/parse_issues.py index d287c6d..88b9526 100644 --- a/src/pyosmeta/parse_issues.py +++ b/src/pyosmeta/parse_issues.py @@ -209,6 +209,8 @@ def _postprocess_meta(self, meta: dict, body: List[str]) -> dict: meta["partners"] = self.get_categories( body, "## Community Partnerships", 3, keyed=True ) + genai = self.get_genai_disclosure(body) + meta.update(genai) if "joss_doi" in meta: # Normalize the JOSS archive field. Some issues use `JOSS DOI` others `JOSS` meta["joss"] = meta.pop("joss_doi") @@ -394,6 +396,77 @@ def get_repo_paths( all_repos[a_package] = {"owner": owner, "repo_name": repo} return all_repos + def get_genai_disclosure(self, issue_list: list[str]) -> dict[str, Any]: + """Parse the Development Best Practices & GenerativeAI Use Disclosure + section from the issue body. + + Extracts whether GenAI was used (checkbox), the listed tools/frameworks, + and the description of nature/scope of support. Returns all None if the + section is absent (e.g. older submissions). + + Parameters + ---------- + issue_list : list[str] + The issue body split into lines (after the first ---). + + Returns + ------- + dict + Keys: genai_used (bool or None), genai_tools (str or None), + genai_scope (str or None). 
+ """ + result = { + "genai_used": None, + "genai_tools": None, + "genai_scope": None, + } + section_index = None + for i, line in enumerate(issue_list): + if "## " in line and ( + "Development Best Practices" in line or "GenerativeAI" in line + ): + section_index = i + break + if section_index is None: + return result + + # Find the "Generative AI tools were used" checkbox in this section + genai_checkbox = "Generative AI tools were used" + for i in range(section_index + 1, len(issue_list)): + line = issue_list[i] + if line.strip().startswith("## "): + break + if genai_checkbox in line: + result["genai_used"] = bool(re.search(r"-\s*\[[xX]\]", line)) + break + + def _collect_after_subheading(needle: str) -> str | None: + idx = None + for i, line in enumerate(issue_list): + if "### " in line and needle in line: + idx = i + break + if idx is None: + return None + lines = [] + for i in range(idx + 1, len(issue_list)): + line = issue_list[i] + if line.strip().startswith("### ") or line.strip().startswith( + "## " + ): + break + lines.append(line) + text = "\n".join(lines).strip() + return text if text else None + + result["genai_tools"] = _collect_after_subheading( + "Please list the tools" + ) + result["genai_scope"] = _collect_after_subheading( + "Describe the nature and scope" + ) + return result + def get_categories( self, issue_list: list[str], diff --git a/src/pyosmeta/utils_clean.py b/src/pyosmeta/utils_clean.py index 8801f3f..ee30f5f 100644 --- a/src/pyosmeta/utils_clean.py +++ b/src/pyosmeta/utils_clean.py @@ -221,6 +221,8 @@ def clean_archive(archive): if not archive: # If field is empty, return None return None + if archive.lower() in ("n/a", "tbd"): + return None if archive.startswith("[") and archive.endswith(")"): # Extract the outermost link link = archive[archive.rfind("](") + 2 : -1] @@ -237,10 +239,6 @@ def clean_archive(archive): logger.warning(f"Invalid archive URL (not resolving): {archive}") # raise ValueError(f"Invalid archive URL (not 
resolving): {archive}") return archive - elif archive.lower() == "n/a": - return None - elif archive.lower() == "tbd": - return None else: raise ValueError(f"Invalid archive URL: {archive}") diff --git a/tests/data/reviews/submission_with_genai_section.txt b/tests/data/reviews/submission_with_genai_section.txt new file mode 100644 index 0000000..474a85d --- /dev/null +++ b/tests/data/reviews/submission_with_genai_section.txt @@ -0,0 +1,68 @@ +Submitting Author: Author Name (@username) +All current maintainers: (@username, @username2) +Package Name: genai_test_package +One-Line Description of Package: A package that used GenAI during development +Repository Link: https://github.com/username/genai_test_package +Version submitted: v.0.1.0 +Editor: @editoruser +Reviewer 1: @reviewer1 +Reviewer 2: @reviewer2 +Archive: TBD +JOSS DOI: TBD +Version accepted: v.0.1.0 +Date accepted (month/day/year): 04/21/2024 + +--- + +## Code of Conduct & Commitment to Maintain Package + +- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted. +- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment]. + +## Development Best Practices & GenerativeAI Use Disclosure + +- [x] This package has a public development history spanning 3-6 months, with commits distributed over time that reflect **iterative, thoughtful development.** +- [x] All code in this package has been **carefully reviewed by a human**. Its implementation is also understood by the authors submitting the package. +- [x] All communication on this issue will be written by a human (someone on your maintainer team). We embrace the use of LLMs for translation and grammar correction. We prefer honest interactions over ones that prioritize perfect language and grammar. As little aid from a LLM as possible. 
def test_parse_submission_with_genai_section(process_issues, data_file):
    """
    Integration test: parse a full submission fixture that contains the
    Development Best Practices & GenerativeAI Use Disclosure section.

    Checks that the GenAI section does not break parsing, that genai_used,
    genai_tools, and genai_scope are populated from it, and that categories
    and partners are still extracted from the sections that follow.
    """
    issue_text = data_file("reviews/submission_with_genai_section.txt", True)
    parsed = process_issues.parse_issue(issue_text)

    assert parsed.package_name == "genai_test_package"
    assert parsed.genai_used is True

    tools = parsed.genai_tools
    assert tools is not None
    assert "Cursor" in tools
    assert "Copilot" in tools

    scope = parsed.genai_scope
    assert scope is not None
    assert "autocomplete" in scope
    assert "documentation" in scope

    # Scope and Community Partnerships still parse correctly after GenAI section
    assert parsed.categories is not None
    assert "data-processing-munging" in parsed.categories
    assert parsed.partners is not None
    # Partners may be enum members or plain strings; compare on raw values.
    partner_values = [
        getattr(partner, "value", partner)
        for partner in (parsed.partners or [])
    ]
    assert "astropy" in partner_values