Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/check-release.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ jobs:
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2
- uses: r-lib/actions/setup-r@v1
- uses: actions/checkout@v4
- uses: r-lib/actions/setup-r@v2
- name: Install dependencies
run: |
install.packages(c("remotes", "rcmdcheck"))
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/render-rmarkdown.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@ jobs:
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
with:
fetch-depth: 0
- uses: r-lib/actions/setup-r@v1
- uses: r-lib/actions/setup-pandoc@v1
- uses: r-lib/actions/setup-r@v2
- uses: r-lib/actions/setup-pandoc@v2
- name: Install packages
run: Rscript -e 'install.packages(c("devtools", "rmarkdown", "ggplot2", "dplyr", "tidytext", "stopwords"))'
- name: Install local package
Expand Down
38 changes: 28 additions & 10 deletions .github/workflows/schedule-commit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,14 @@ jobs:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4

- uses: r-lib/actions/setup-r@master
- uses: r-lib/actions/setup-r@v2
with:
r-version: ${{ matrix.config.r }}
http-user-agent: ${{ matrix.config.http-user-agent }}

- uses: r-lib/actions/setup-pandoc@master
- uses: r-lib/actions/setup-pandoc@v2

- name: Query dependencies
run: |
Expand All @@ -45,7 +45,7 @@ jobs:
shell: Rscript {0}

- name: Cache R packages
uses: actions/cache@v1
uses: actions/cache@v3
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }}
Expand All @@ -60,15 +60,33 @@ jobs:
# Runs the data update script with Rscript as the step shell.
- name: Generate data
  run: |
    source("scripts/update_data.R")
  shell: Rscript {0}

# Stages data/ and reports via step outputs whether anything changed.
# Nothing is committed here: committing happens in a later step that is
# gated on the `has_changes` output, so a run without new data does not fail.
- name: Check for changes and create commit message
  id: changes
  run: |
    git config --local user.email "actions@github.com"
    git config --local user.name "GitHub Actions"
    git add data/

    # Check if there are changes to commit
    if git diff --staged --quiet; then
      echo "has_changes=false" >> "$GITHUB_OUTPUT"
      echo "No changes to commit"
    else
      echo "has_changes=true" >> "$GITHUB_OUTPUT"
      # Get commit message from R script
      COMMIT_MSG=$(Rscript scripts/count_new_episodes.R)
      echo "commit_message=$COMMIT_MSG" >> "$GITHUB_OUTPUT"
      echo "Commit message: $COMMIT_MSG"
    fi

# Commits the staged data and pushes it; skipped when nothing changed.
- name: Commit and push changes
  if: steps.changes.outputs.has_changes == 'true'
  env:
    # Pass the message through the environment instead of expanding the
    # expression inside the script, so quotes or shell metacharacters in
    # the message cannot alter the command that is executed.
    COMMIT_MSG: ${{ steps.changes.outputs.commit_message }}
  run: |
    git commit -m "Update transcripts: $COMMIT_MSG"
    git push

- name: Session info
run: |
options(width = 100)
Expand Down
52 changes: 52 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,57 @@
# R user files
.Rproj.user
.Rhistory
.RData
.Rdata

# OAuth tokens
.httr-oauth

# knitr and R markdown default cache directories
*_cache/
/cache/

# Temporary files created by R markdown
*.utf8.md
*.knit.md

# R Environment Variables
.Renviron

# pkgdown site
docs/
doc/
Meta/

# translation temp files
po/*~

# RStudio files
.Rproj.user/
*.Rproj

# produced vignettes
vignettes/*.html
vignettes/*.pdf

# R check outputs
*.Rcheck/

# Package build artifacts
*.tar.gz
*.tgz

# MacOS
.DS_Store

# GitHub dependencies
depends.Rds
.github/depends.Rds
.github/R-version

# IDE
.vscode/
.idea/

# Test outputs
tests/testthat/_snaps/
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@ License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
RoxygenNote: 7.3.0
URL: https://github.com/quickcoffee/coronavirusupdate
BugReports: https://github.com/quickcoffee/coronavirusupdate/issues
Suggests:
Suggests:
spelling,
ggplot2,
tidytext
tidytext,
testthat (>= 3.0.0)
Language: en-US
Imports:
magrittr,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(scrape_coronavirusupdate)
importFrom(magrittr,"%>%")
58 changes: 58 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# coronavirusupdate 0.0.1.9000 (Development version)

## Major Improvements

### Code Quality & Reliability
* Added comprehensive error handling to all scraping functions with informative error messages
* Implemented input validation across all functions
* Added graceful handling of NULL inputs and empty results
* Scraping functions now provide detailed warnings when extraction fails

### Documentation
* Added complete roxygen2 documentation to all functions
* Improved function descriptions with parameter details and return values
* Added usage examples and implementation details
* All internal functions now properly documented with @keywords internal

### Testing
* Set up testthat testing framework
* Added unit tests for all extraction functions
* Added data validation tests to ensure data quality
* Added input validation tests for main scraping function
* Created test suite for edge cases and error handling

### GitHub Actions & Automation
* Updated all GitHub Actions to newer major versions (checkout@v4, cache@v3, setup-r@v2, setup-pandoc@v2)
* Improved workflow to skip commits when no new data is available
* Enhanced commit messages to show number of new episodes added
* Added helper script to count new episodes for informative commit messages

### Package Infrastructure
* Updated .gitignore with standard R package exclusions
* Added NEWS.md for tracking package changes
* Updated DESCRIPTION with testthat dependency
* Updated RoxygenNote to version 7.3.0
Copy link

Copilot AI Nov 6, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The NEWS.md states RoxygenNote was updated to 7.2.3, but DESCRIPTION file shows 7.3.0, creating an inconsistency. These version numbers should match.

Suggested change
* Improved RoxygenNote to version 7.2.3
* Improved RoxygenNote to version 7.3.0

Copilot uses AI. Check for mistakes.

### Data Quality
* Maintained existing speaker name normalization
* Preserved incremental scraping functionality
* Kept multi-format output support (RDS, RDA, Parquet)

## Bug Fixes
* Fixed potential crashes from NULL HTML responses
* Improved handling of malformed or changed website structure
* Better error messages for debugging scraping failures

---

# coronavirusupdate 0.0.1

## Initial Release

* Initial package release
* Scraping functionality for NDR Coronavirus-Update podcast transcripts
* Incremental scraping support (only fetches new episodes)
* Speaker name normalization
* Multiple output formats (RDS, RDA, Parquet)
* Automated weekly updates via GitHub Actions
* Tidy data format with one row per paragraph
41 changes: 36 additions & 5 deletions R/extract_episode_length.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,38 @@
#' Extract episode length from episode HTML
#'
#' Reads the text of the first `.textcontent h2` node and extracts the
#' duration written in parentheses at the very end of that heading
#' (2 to 20 characters between the parentheses).
#'
#' @param .episode_html An xml_document object containing the episode HTML,
#'   typically obtained via \code{xml2::read_html()}
#'
#' @return A character string containing the episode duration, or
#'   `NA_character_` (with a warning) if the input is `NULL`, the duration
#'   cannot be found, or the extraction raises an error
#'
#' @note TODO: Consider getting episode length from iframe player rather than
#'   html_node for more reliable extraction
#'
#' @keywords internal
extract_episode_length <- function(.episode_html) {
  # Guard before any scraping call: nothing below may touch a NULL document.
  if (is.null(.episode_html)) {
    warning("Episode HTML is NULL, returning NA for episode length")
    return(NA_character_)
  }

  tryCatch(
    {
      heading <- .episode_html %>%
        rvest::html_node(css = ".textcontent h2") %>%
        rvest::html_text()

      # Lookbehind/lookahead keep only the text between "(" and the final ")".
      result <- stringr::str_extract(
        heading,
        pattern = "(?<=\\().{2,20}(?=\\)$)"
      )

      # Test the length first so is.na() is only consulted for a real value.
      if (length(result) == 0 || is.na(result)) {
        warning("Could not extract episode length from HTML")
        NA_character_
      } else {
        result
      }
    },
    error = function(e) {
      # call. = FALSE: the handler's own call would only add noise.
      warning(
        paste("Error extracting episode length:", conditionMessage(e)),
        call. = FALSE
      )
      NA_character_
    }
  )
}
42 changes: 35 additions & 7 deletions R/extract_last_change.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,37 @@
#' Extract last change date from episode HTML
#'
#' Reads the text of the first `.lastchanged` node, strips the leading
#' label (a run of letters followed by a punctuation character) and the
#' German word "Uhr", and parses the remainder as a day-month-year
#' hour-minute timestamp.
#'
#' @param .episode_html An xml_document object containing the episode HTML,
#'   typically obtained via \code{xml2::read_html()}
#'
#' @return A POSIXct datetime object representing when the transcript was last
#'   modified, or NA (with a warning) if the input is `NULL` or the date
#'   cannot be extracted or parsed
#'
#' @keywords internal
extract_last_change <- function(.episode_html) {
  # Guard before any scraping call: nothing below may touch a NULL document.
  if (is.null(.episode_html)) {
    warning("Episode HTML is NULL, returning NA for last change date")
    return(lubridate::as_datetime(NA))
  }

  tryCatch(
    {
      result <- .episode_html %>%
        rvest::html_node(css = ".lastchanged") %>%
        rvest::html_text() %>%
        stringr::str_remove(pattern = "[:alpha:]+[:punct:]") %>%
        stringr::str_remove(pattern = "Uhr") %>%
        stringr::str_squish() %>%
        lubridate::dmy_hm()

      if (is.na(result)) {
        warning("Could not parse last change date from HTML")
      }

      result
    },
    error = function(e) {
      # call. = FALSE: the handler's own call would only add noise.
      warning(
        paste("Error extracting last change date:", conditionMessage(e)),
        call. = FALSE
      )
      lubridate::as_datetime(NA)
    }
  )
}
41 changes: 32 additions & 9 deletions R/extract_speaker_names.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,34 @@
#' Extract speaker names from transcript nodes
#'
#' Extracts and cleans speaker names from HTML transcript nodes. For each
#' node the text of its `strong` child is taken; it counts as a speaker name
#' when it starts with an upper-case letter and ends with a colon. The first
#' colon is then removed. Includes a manual fix for a known edge case.
#'
#' @param .transcript_nodes An xml_nodeset containing the transcript paragraph
#'   nodes, typically obtained via \code{extract_transcript_nodes()}
#'
#' @return A character vector of speaker names, with NA for paragraphs without
#'   identified speakers. Returns an empty character vector (with a warning)
#'   if the input is `NULL` or empty, or if extraction raises an error; in
#'   the error case the result length does not match the number of nodes.
#'
#' @keywords internal
extract_speaker_name <- function(.transcript_nodes) {
  # Guard before any scraping call: nothing below may touch NULL/empty input.
  if (is.null(.transcript_nodes) || length(.transcript_nodes) == 0) {
    warning("Transcript nodes are NULL or empty, returning empty character vector")
    return(character(0))
  }

  tryCatch(
    rvest::html_node(x = .transcript_nodes, xpath = "strong") %>%
      rvest::html_text(trim = TRUE) %>%
      stringr::str_squish() %>%
      stringr::str_extract(pattern = "^[:upper:][:alpha:]+.+\\:$") %>%
      stringr::str_remove(pattern = ":") %>%
      # Manual fix for episode 38. The umlaut is written as a Unicode escape
      # so the pattern does not depend on the encoding of this source file.
      stringr::str_replace(
        pattern = "Eine Bitte an unsere H\u00f6rer",
        replacement = "Korinna Hennig"
      ) %>%
      stringr::str_squish() %>%
      dplyr::na_if(y = ""),
    error = function(e) {
      # call. = FALSE: the handler's own call would only add noise.
      warning(
        paste("Error extracting speaker names:", conditionMessage(e)),
        call. = FALSE
      )
      character(0)
    }
  )
}
Loading
Loading