quickcoffee · quickcoffee · Nov 6, 2025 · Nov 6, 2025 · Copilot · Nov 6, 2025
diff --git a/.github/workflows/check-release.yaml b/.github/workflows/check-release.yaml
@@ -18,8 +18,8 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
-      - uses: r-lib/actions/setup-r@v1
+      - uses: actions/checkout@v4
+      - uses: r-lib/actions/setup-r@v2
       - name: Install dependencies
         run: |
           install.packages(c("remotes", "rcmdcheck"))

diff --git a/.github/workflows/render-rmarkdown.yaml b/.github/workflows/render-rmarkdown.yaml
@@ -21,11 +21,11 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
-      - uses: r-lib/actions/setup-r@v1
-      - uses: r-lib/actions/setup-pandoc@v1
+      - uses: r-lib/actions/setup-r@v2
+      - uses: r-lib/actions/setup-pandoc@v2
       - name: Install packages
         run: Rscript -e 'install.packages(c("devtools", "rmarkdown", "ggplot2", "dplyr", "tidytext", "stopwords"))'
       - name: Install local package

diff --git a/.github/workflows/schedule-commit.yaml b/.github/workflows/schedule-commit.yaml
@@ -27,14 +27,14 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
-      - uses: r-lib/actions/setup-r@master
+      - uses: r-lib/actions/setup-r@v2
         with:
           r-version: ${{ matrix.config.r }}
           http-user-agent: ${{ matrix.config.http-user-agent }}
 
-      - uses: r-lib/actions/setup-pandoc@master
+      - uses: r-lib/actions/setup-pandoc@v2
 
       - name: Query dependencies
         run: |
@@ -45,7 +45,7 @@ jobs:
         shell: Rscript {0}
 
       - name: Cache R packages
-        uses: actions/cache@v1
+        uses: actions/cache@v3
         with:
           path: ${{ env.R_LIBS_USER }}
           key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
@@ -60,15 +60,37 @@ jobs:
       - name: Generate data
         run: |
           source("scripts/update_data.R")
-        shell: Rscript {0} 
-
-      - name: Commit files
+        shell: Rscript {0}
+
+      - name: Check for changes and create commit message
+        id: changes
         run: |
           git config --local user.email "actions@github.com"
           git config --local user.name "GitHub Actions"
-          git add --all
-          git commit -am "add data"
-          git push 
+          git add data/
+
+          # Check if there are changes to commit
+          if git diff --staged --quiet; then
+            echo "has_changes=false" >> "$GITHUB_OUTPUT"
+            echo "No changes to commit"
+          else
+            echo "has_changes=true" >> "$GITHUB_OUTPUT"
+            # Get commit message from R script
+            COMMIT_MSG=$(Rscript scripts/count_new_episodes.R)
+            echo "commit_message=$COMMIT_MSG" >> "$GITHUB_OUTPUT"
+            echo "Commit message: $COMMIT_MSG"
+          fi
+
+      - name: Commit and push changes
+        if: steps.changes.outputs.has_changes == 'true'
+        env:
+          # Hand the message over via the environment: expanding it directly
+          # inside the script would let the shell re-parse quotes or $(...).
+          COMMIT_MESSAGE: ${{ steps.changes.outputs.commit_message }}
+        run: |
+          git commit -m "Update transcripts: $COMMIT_MESSAGE"
+          git push
+
       - name: Session info
         run: |
           options(width = 100)

diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,57 @@
+# R user files
 .Rproj.user
 .Rhistory
+.RData
+.Rdata
+
+# OAuth tokens
 .httr-oauth
+
+# knitr and R markdown default cache directories
+*_cache/
+/cache/
+
+# Temporary files created by R markdown
+*.utf8.md
+*.knit.md
+
+# R Environment Variables
+.Renviron
+
+# pkgdown site
+docs/
+doc/
+Meta/
+
+# translation temp files
+po/*~
+
+# RStudio files
+.Rproj.user/
+*.Rproj
+
+# produced vignettes
+vignettes/*.html
+vignettes/*.pdf
+
+# R check outputs
+*.Rcheck/
+
+# Package build artifacts
+*.tar.gz
+*.tgz
+
+# MacOS
 .DS_Store
+
+# GitHub dependencies
 depends.Rds
+.github/depends.Rds
+.github/R-version
+
+# IDE
+.vscode/
+.idea/
+
+# Test outputs
+tests/testthat/_snaps/
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -11,13 +11,14 @@ License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.1.1
+RoxygenNote: 7.3.0
 URL: https://github.com/quickcoffee/coronavirusupdate
 BugReports: https://github.com/quickcoffee/coronavirusupdate/issues
-Suggests: 
+Suggests:
     spelling,
     ggplot2,
-    tidytext
+    tidytext,
+    testthat (>= 3.0.0)
 Language: en-US
 Imports: 
     magrittr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,4 +1,5 @@
 # Generated by roxygen2: do not edit by hand
 
 export("%>%")
+export(scrape_coronavirusupdate)
 importFrom(magrittr,"%>%")
diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,58 @@
+# coronavirusupdate 0.0.1.9000 (Development version)
+
+## Major Improvements
+
+### Code Quality & Reliability
+* Added comprehensive error handling to all scraping functions with informative error messages
+* Implemented input validation across all functions
+* Added graceful handling of NULL inputs and empty results
+* Scraping functions now provide detailed warnings when extraction fails
+
+### Documentation
+* Added complete roxygen2 documentation to all functions
+* Improved function descriptions with parameter details and return values
+* Added usage examples and implementation details
+* All internal functions now properly documented with @keywords internal
+
+### Testing
+* Set up testthat testing framework
+* Added unit tests for all extraction functions
+* Added data validation tests to ensure data quality
+* Added input validation tests for main scraping function
+* Created test suite for edge cases and error handling
+
+### GitHub Actions & Automation
+* Updated all GitHub Actions to newer major versions (checkout@v4, cache@v3, setup-r@v2, setup-pandoc@v2)
+* Improved workflow to skip commits when no new data is available
+* Enhanced commit messages to show number of new episodes added
+* Added helper script to count new episodes for informative commit messages
+
+### Package Infrastructure
+* Updated .gitignore with standard R package exclusions
+* Added NEWS.md for tracking package changes
+* Updated DESCRIPTION with testthat dependency
+* Updated RoxygenNote to version 7.3.0
+
+### Data Quality
+* Maintained existing speaker name normalization
+* Preserved incremental scraping functionality
+* Kept multi-format output support (RDS, RDA, Parquet)
+
+## Bug Fixes
+* Fixed potential crashes from NULL HTML responses
+* Improved handling of malformed or changed website structure
+* Better error messages for debugging scraping failures
+
+---
+
+# coronavirusupdate 0.0.1
+
+## Initial Release
+
+* Initial package release
+* Scraping functionality for NDR Coronavirus-Update podcast transcripts
+* Incremental scraping support (only fetches new episodes)
+* Speaker name normalization
+* Multiple output formats (RDS, RDA, Parquet)
+* Automated weekly updates via GitHub Actions
+* Tidy data format with one row per paragraph
diff --git a/R/extract_episode_length.R b/R/extract_episode_length.R
@@ -1,7 +1,38 @@
-# TODO get episode length from iframe player rather than html_node
+#' Extract episode length from episode HTML
+#'
+#' Extracts the duration of a podcast episode from the HTML content.
+#' The duration is typically found in parentheses in the h2 element.
+#'
+#' @param .episode_html An xml_document object containing the episode HTML,
+#'   typically obtained via \code{xml2::read_html()}
+#'
+#' @return A character string containing the episode duration, or NA_character_
+#'   if the duration cannot be extracted
+#'
+#' @note TODO: Consider getting episode length from iframe player rather than html_node
+#'   for more reliable extraction
+#'
+#' @keywords internal
 extract_episode_length <- function(.episode_html) {
-  .episode_html %>%
-    rvest::html_node(css = ".textcontent h2") %>%
-    rvest::html_text() %>%
-    stringr::str_extract(pattern = "(?<=\\().{2,20}(?=\\)$)")
+  tryCatch({
+    if (is.null(.episode_html)) {
+      warning("Episode HTML is NULL, returning NA for episode length")
+      return(NA_character_)
+    }
+
+    result <- .episode_html %>%
+      rvest::html_node(css = ".textcontent h2") %>%
+      rvest::html_text() %>%
+      stringr::str_extract(pattern = "(?<=\\().{2,20}(?=\\)$)")
+
+    if (is.na(result) || length(result) == 0) {
+      warning("Could not extract episode length from HTML")
+      return(NA_character_)
+    }
+
+    return(result)
+  }, error = function(e) {
+    warning(paste("Error extracting episode length:", e$message))
+    return(NA_character_)
+  })
 }
diff --git a/R/extract_last_change.R b/R/extract_last_change.R
@@ -1,9 +1,37 @@
+#' Extract last change date from episode HTML
+#'
+#' Extracts and parses the last modification date of a podcast episode transcript
+#' from the HTML content. The date is parsed from German date format.
+#'
+#' @param .episode_html An xml_document object containing the episode HTML,
+#'   typically obtained via \code{xml2::read_html()}
+#'
+#' @return A POSIXct datetime object representing when the transcript was last
+#'   modified, or NA if the date cannot be extracted or parsed
+#'
+#' @keywords internal
 extract_last_change <- function(.episode_html) {
-  .episode_html %>%
-    rvest::html_node(css = ".lastchanged") %>%
-    rvest::html_text() %>%
-    stringr::str_remove(pattern = "[:alpha:]+[:punct:]") %>%
-    stringr::str_remove(pattern = "Uhr") %>%
-    stringr::str_squish() %>%
-    lubridate::dmy_hm()
+  tryCatch({
+    if (is.null(.episode_html)) {
+      warning("Episode HTML is NULL, returning NA for last change date")
+      return(lubridate::as_datetime(NA))
+    }
+
+    result <- .episode_html %>%
+      rvest::html_node(css = ".lastchanged") %>%
+      rvest::html_text() %>%
+      stringr::str_remove(pattern = "[:alpha:]+[:punct:]") %>%
+      stringr::str_remove(pattern = "Uhr") %>%
+      stringr::str_squish() %>%
+      lubridate::dmy_hm()
+
+    if (is.na(result)) {
+      warning("Could not parse last change date from HTML")
+    }
+
+    return(result)
+  }, error = function(e) {
+    warning(paste("Error extracting last change date:", e$message))
+    return(lubridate::as_datetime(NA))
+  })
 }
diff --git a/R/extract_speaker_names.R b/R/extract_speaker_names.R
@@ -1,11 +1,34 @@
+#' Extract speaker names from transcript nodes
+#'
+#' Extracts and cleans speaker names from HTML transcript nodes. Speaker names
+#' are identified by strong tags and specific text patterns (capitalized text
+#' ending with a colon). Includes manual fixes for known edge cases.
+#'
+#' @param .transcript_nodes An xml_nodeset containing the transcript paragraph
+#'   nodes, typically obtained via \code{extract_transcript_nodes()}
+#'
+#' @return A character vector of speaker names, with NA for paragraphs without
+#'   identified speakers, or an empty character vector if extraction fails
+#'
+#' @keywords internal
 extract_speaker_name <- function(.transcript_nodes) {
-  rvest::html_node(x = .transcript_nodes, xpath = "strong") %>%
-    rvest::html_text(trim = TRUE) %>%
-    stringr::str_squish() %>%
-    stringr::str_extract(pattern = "^[:upper:][:alpha:]+.+\\:$") %>%
-    stringr::str_remove(pattern = ":") %>%
-    # manual fix for episode 38
-    stringr::str_replace(pattern = "Eine Bitte an unsere Hörer", replacement = "Korinna Hennig") %>%
-    stringr::str_squish() %>%
-    dplyr::na_if(y = "")
+  tryCatch({
+    if (is.null(.transcript_nodes) || length(.transcript_nodes) == 0) {
+      warning("Transcript nodes are NULL or empty, returning empty character vector")
+      return(character(0))
+    }
+
+    rvest::html_node(x = .transcript_nodes, xpath = "strong") %>%
+      rvest::html_text(trim = TRUE) %>%
+      stringr::str_squish() %>%
+      stringr::str_extract(pattern = "^[:upper:][:alpha:]+.+\\:$") %>%
+      stringr::str_remove(pattern = ":") %>%
+      # manual fix for episode 38
+      stringr::str_replace(pattern = "Eine Bitte an unsere Hörer", replacement = "Korinna Hennig") %>%
+      stringr::str_squish() %>%
+      dplyr::na_if(y = "")
+  }, error = function(e) {
+    warning(paste("Error extracting speaker names:", e$message))
+    return(character(0))
+  })
 }