From 979e51eb63d2b86ee256e399eac6afe55ee7522d Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Tue, 14 Oct 2025 10:44:54 +0200
Subject: [PATCH 01/10] feat(indico-ingestor): extended ical ingestor to .ics
and Indico
---
.../concerns/ical_ingestor_export_url.rb | 49 +++++
lib/ingestors/concerns/sitemap_helpers.rb | 48 +++++
lib/ingestors/ical_ingestor.rb | 183 +++++++++---------
test/unit/ingestors/ical_ingestor_test.rb | 54 +++++-
4 files changed, 239 insertions(+), 95 deletions(-)
create mode 100644 lib/ingestors/concerns/ical_ingestor_export_url.rb
create mode 100644 lib/ingestors/concerns/sitemap_helpers.rb
diff --git a/lib/ingestors/concerns/ical_ingestor_export_url.rb b/lib/ingestors/concerns/ical_ingestor_export_url.rb
new file mode 100644
index 000000000..65968fa55
--- /dev/null
+++ b/lib/ingestors/concerns/ical_ingestor_export_url.rb
@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+
+module Ingestors
+ module Concerns
+ # Gets the proper URL to export an ics or ical
+ module IcalIngestorExportUrl
+ private
+
+ # 1. If the host includes 'indico', ensures the path ends with '/events.ics'.
+ # 2. If the path already ends with '/events.ics', return as-is.
+ # 3. Otherwise, append '?ical=true' query param if not already present.
+ #
+ # This method never mutates the original URL string.
+ # Returns the updated URL string or nil if input is blank.
+ def to_export(url)
+ return nil if url.blank?
+
+ uri = URI.parse(url)
+ path = uri.path.to_s
+
+ if uri.host&.include?('indico')
+ ensure_events_ics_path(uri)
+ elsif path.match?(%r{/(event|events)\.ics\z})
+ uri.to_s
+ else
+ ensure_ical_query(uri)
+ end
+ end
+
+ # Ensures the Indico URL ends with '/events.ics'
+ def ensure_events_ics_path(uri)
+ if uri.path&.include?('event')
+ uri.path = File.join(uri.path, 'event.ics') unless uri.path.end_with?('/event.ics')
+ elsif uri.path&.include?('category')
+ uri.path = File.join(uri.path, 'events.ics') unless uri.path.end_with?('/events.ics')
+ end
+ uri.to_s
+ end
+
+ # Ensures the URL has '?ical=true' in its query params
+ def ensure_ical_query(uri)
+ query = URI.decode_www_form(uri.query.to_s).to_h
+ query['ical'] = 'true' unless query['ical'] == 'true'
+ uri.query = URI.encode_www_form(query)
+ uri.to_s
+ end
+ end
+ end
+end
diff --git a/lib/ingestors/concerns/sitemap_helpers.rb b/lib/ingestors/concerns/sitemap_helpers.rb
new file mode 100644
index 000000000..c6e1b251d
--- /dev/null
+++ b/lib/ingestors/concerns/sitemap_helpers.rb
@@ -0,0 +1,48 @@
+# frozen_string_literal: true
+
+module Ingestors
+ module Concerns
+ # From a sitemap.{xml|txt} or a single URL, get the list of URLs (= sources)
+ module SitemapHelpers
+ private
+
+ # Reads either a sitemap.{xml|txt} or a single URL
+ # Returns a list of URLs from 1 to n URLs
+ def get_sources(source_url)
+ case source_url.downcase
+ when /sitemap(.*)?\.xml\Z/
+ parse_xml_sitemap(source_url)
+ when /sitemap(.*)?\.txt\Z/
+ parse_txt_sitemap(source_url)
+ else
+ [source_url]
+ end
+ end
+
+ def parse_xml_sitemap(url)
+ urls = SitemapParser.new(
+ url,
+ recurse: true,
+ headers: { 'User-Agent' => config[:user_agent] }
+ ).to_a.uniq.map(&:strip)
+
+ log_sitemap('xml', url, urls.count)
+ urls
+ rescue StandardError => e
+ @messages << "Extract from sitemap[#{url}] failed with: #{e.message}"
+ nil
+ end
+
+ def parse_txt_sitemap(url)
+ urls = open_url(url).to_a.uniq.map(&:strip)
+
+ log_sitemap('txt', url, urls.count)
+ urls
+ end
+
+ def log_sitemap(ext, url, count)
+ @messages << "Parsing .#{ext} sitemap: #{url}\n - #{count} URLs found"
+ end
+ end
+ end
+end
diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb
index e91fdecfa..0621b94f1 100644
--- a/lib/ingestors/ical_ingestor.rb
+++ b/lib/ingestors/ical_ingestor.rb
@@ -1,130 +1,125 @@
+# frozen_string_literal: true
+
require 'icalendar'
require 'nokogiri'
require 'open-uri'
require 'tzinfo'
module Ingestors
+ # Reads from direct ical / .ics / Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps.
class IcalIngestor < Ingestor
+ include Ingestors::Concerns::SitemapHelpers
+ include Ingestors::Concerns::IcalIngestorExportUrl
+
def self.config
{
key: 'ical',
- title: 'iCalendar',
+ title: 'iCalendar / Indico / .ics File',
category: :events
}
end
- def read(url)
- unless url.nil?
- if url.to_s.downcase.end_with? 'sitemap.xml'
- process_sitemap url
- else
- process_icalendar url
- end
+ def read(source_url)
+ @verbose = false
+ sources = get_sources(source_url)
+ return if sources.nil?
+
+ sources.each do |url|
+ process_url(url)
end
end
private
- def process_sitemap(url)
- # find urls for individual icalendar files
- begin
- sitemap = Nokogiri::XML.parse(open_url(url, raise: true))
- locs = sitemap.xpath('/ns:urlset/ns:url/ns:loc', {
- 'ns' => 'http://www.sitemaps.org/schemas/sitemap/0.9'
- })
- locs.each do |loc|
- process_icalendar(loc.text)
- end
- rescue Exception => e
- @messages << "Extract from sitemap[#{url}] failed with: #{e.message}"
+ # Modifies the given URL to the ics or ical export.
+ # Loops into each Ical event to process it.
+ # Note: One .ics file can have multiple Ical events.
+ def process_url(url)
+ export_url = to_export(url)
+ events = Icalendar::Event.parse(open_url(export_url, raise: true).set_encoding('utf-8'))
+ events.each do |e|
+ process_calevent(e)
end
+ rescue StandardError => e
+ @messages << "Process file url[#{export_url}] failed with: #{e.message}"
+ end
- # finished
- nil
+ # Builds the OpenStruct event and adds it in event.
+ def process_calevent(calevent)
+ event_to_add = OpenStruct.new.tap do |event|
+ assign_basic_info(event, calevent)
+ assign_time_info(event, calevent)
+ assign_location_info(event, calevent.location)
+ end
+ add_event(event_to_add)
+ rescue StandardError => e
+ @messages << "Process iCalendar failed with: #{e.message}"
end
- def process_icalendar(url)
- # process individual ics file
- query = '?ical=true'
+ # Assigns to event: url, title, description, keywords.
+ def assign_basic_info(event, calevent)
+ event.url = calevent.url.to_s
+ event.title = calevent.summary.to_s
+ event.description = process_description calevent.description
+ event.keywords = process_keywords(calevent.categories)
+ end
- begin
- # append query (if required)
- file_url = url
- file_url << query unless url.to_s.downcase.ends_with? query
+ # Assigns to event: start, end, timezone.
+ def assign_time_info(event, calevent)
+ event.start = calevent.dtstart&.to_time unless calevent.dtstart.nil?
+ event.end = calevent.dtend&.to_time unless calevent.dtend.nil?
+ event.timezone = get_tzid(calevent.dtstart)
+ end
- # process file
- events = Icalendar::Event.parse(open_url(file_url, raise: true).set_encoding('utf-8'))
+ # Assigns to event: venue, online, city.
+ def assign_location_info(event, location)
+ return if location.blank? || !location.present?
- # process each event
- events.each do |e|
- process_event(e)
- end
- rescue Exception => e
- @messages << "Process file url[#{file_url}] failed with: #{e.message}"
- end
+ event.venue = location.to_s
+ event.online = location.downcase.include?('online')
+ event.city, event.postcode, event.country = process_location(location)
+ end
- # finished
- nil
+ # Removes all `<br />` tags and converts HTML to MD.
+ def process_description(input)
+ return input if input.nil?
+
+ desc = input.to_s.gsub('', '
')
+ convert_description(desc)
end
- def process_event(calevent)
- # puts "calevent: #{calevent.inspect}"
- begin
- # set fields
- event = OpenStruct.new
- event.url = calevent.url.to_s
- event.title = calevent.summary.to_s
- event.description = process_description calevent.description
-
- # puts "\n\ncalevent.description = #{calevent.description}"
- # puts "\n\n... converted = #{event.description}"
-
- event.end = calevent.dtend&.to_time
- unless calevent.dtstart.nil?
- dtstart = calevent.dtstart
- event.start = dtstart&.to_time
- tzid = dtstart.ical_params['tzid']
- event.timezone = tzid.first.to_s if !tzid.nil? and tzid.size > 0
- end
-
- event.venue = calevent.location.to_s
- if calevent.location.downcase.include?('online')
- event.online = true
- event.city = nil
- event.postcode = nil
- event.country = nil
- else
- location = convert_location(calevent.location)
- event.city = location['suburb'] unless location['suburb'].nil?
- event.country = location['country'] unless location['country'].nil?
- event.postcode = location['postcode'] unless location['postcode'].nil?
- end
- event.keywords = []
- unless calevent.categories.nil? or calevent.categories.first.nil?
- cats = calevent.categories.first
- if cats.is_a?(Icalendar::Values::Array)
- cats.each do |item|
- event.keywords << item.to_s.lstrip
- end
- else
- event.keywords << cats.to_s.strip
- end
- end
-
- # store event
- @events << event
- rescue Exception => e
- @messages << "Process iCalendar failed with: #{e.message}"
- end
+ # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field.
+ # Handles whether tzid shows up as an Array or a single string
+ def get_tzid(dtstart)
+ return nil unless dtstart.respond_to?(:ical_params)
- # finished
- nil
+ tzid = dtstart.ical_params['tzid']
+ return nil if tzid.nil?
+
+ tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s
end
- def process_description(input)
- return input if input.nil?
+ # Returns an array of 3 location characteristics: suburb, postcode, country
+ # Everything is nil if location.blank or location is online
+ def process_location(location)
+ return [nil, nil, nil] if location.blank?
+
+ if location.to_s.downcase.include?('online')
+ [nil, nil, nil]
+ else
+ [
+ location['suburb'],
+ location['postcode'],
+ location['country']
+ ]
+ end
+ end
+
+ # Returns keywords from the `CATEGORIES` ICal field
+ def process_keywords(categories)
+ return [] if categories.blank?
- convert_description(input.to_s.gsub(/\R/, '<br />'))
+ categories.flatten.compact.map { |cat| cat.to_s.strip }
end
end
end
diff --git a/test/unit/ingestors/ical_ingestor_test.rb b/test/unit/ingestors/ical_ingestor_test.rb
index cb6ff5a54..8a8aaae4c 100644
--- a/test/unit/ingestors/ical_ingestor_test.rb
+++ b/test/unit/ingestors/ical_ingestor_test.rb
@@ -26,7 +26,7 @@ class IcalIngestorTest < ActiveSupport::TestCase
assert ingestor.events.empty?
assert ingestor.materials.empty?
- assert_includes ingestor.messages, 'Extract from sitemap[https://missing.org/sitemap.xml] failed with: 404 '
+ assert_includes ingestor.messages[0], 'Extract from sitemap[https://missing.org/sitemap.xml] failed with:'
end
test 'ingest valid sitemap' do
@@ -187,6 +187,58 @@ class IcalIngestorTest < ActiveSupport::TestCase
end
end
+ test 'process_calevent logs error when exception is raised' do
+ ingestor = Ingestors::IcalIngestor.new
+ calevent = Object.new # fake calevent
+
+ # Stub a method that will raise an error
+ def ingestor.assign_basic_info(*)
+ raise StandardError, 'test failure'
+ end
+
+ ingestor.send(:process_calevent, calevent)
+
+ assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure'
+ end
+
+ test 'to_export method' do
+ ingestor = Ingestors::IcalIngestor.new
+ indico_url_event = 'https://indico.cern.ch/event/1588342/'
+ indico_url_event_with_ics = 'https://indico.cern.ch/event/1588342/event.ics' # ! when '/event', event.ics is singular
+ indico_url_event_with_query = 'https://indico.cern.ch/event/1588342/?somerandom=urlparams&an=otherone'
+ indico_url_event_with_query_with_ics = 'https://indico.cern.ch/event/1588342/event.ics?somerandom=urlparams&an=otherone'
+ indico_url_category = 'https://indico.cern.ch/category/19377/'
+ indico_url_category_with_ics = 'https://indico.cern.ch/category/19377/events.ics' # ! when '/category', eventS.ics is plural
+ indico_url_category_with_query = 'https://indico.cern.ch/category/19377/?a=b&c=d'
+ indico_url_category_with_query_with_ics = 'https://indico.cern.ch/category/19377/events.ics?a=b&c=d'
+ url_with_ics = 'https://mywebsite.com/event/blabla/events.ics'
+ url_with_query_with_ics = 'https://mywebsite.com/event/blabla/events.ics?john=doe&isstub=born'
+ url_no_ical = 'https://mywebsite.com/event/blabla'
+ url_with_ical = 'https://mywebsite.com/event/blabla?ical=true'
+
+ # When indico link – event
+ assert_equal ingestor.send(:to_export, indico_url_event), indico_url_event_with_ics # adds ics
+ assert_equal ingestor.send(:to_export, indico_url_event_with_query), indico_url_event_with_query_with_ics # adds ics
+
+ # When indico link – category
+ assert_equal ingestor.send(:to_export, indico_url_category), indico_url_category_with_ics # adds ics
+ assert_equal ingestor.send(:to_export, indico_url_category_with_query), indico_url_category_with_query_with_ics # adds ics
+
+ # When non-indico link
+ assert_equal ingestor.send(:to_export, url_with_ics), url_with_ics # keeps same
+ assert_equal ingestor.send(:to_export, url_with_query_with_ics), url_with_query_with_ics # keeps same
+
+ # When indico link which already has the /events.ics
+ assert_equal ingestor.send(:to_export, indico_url_event_with_ics), indico_url_event_with_ics # keeps it as-is
+ assert_equal ingestor.send(:to_export, indico_url_event_with_query_with_ics), indico_url_event_with_query_with_ics # keeps it as-is
+
+ # When other url, adds the ical query param
+ assert_equal ingestor.send(:to_export, url_no_ical), url_with_ical
+
+ # When other url with ical query param, keep it as-is
+ assert_equal ingestor.send(:to_export, url_with_ical), url_with_ical
+ end
+
private
def check_event_exists(title, url)
From 59795140a1ccbb0083ade7a74c702d8595358b62 Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Mon, 27 Oct 2025 14:07:17 +0100
Subject: [PATCH 02/10] review(#1161): path more precise, location check, added
contact
---
.../concerns/ical_ingestor_export_url.rb | 49 -------------
lib/ingestors/ical_ingestor.rb | 69 +++++++++++++------
test/unit/ingestors/ical_ingestor_test.rb | 6 +-
3 files changed, 50 insertions(+), 74 deletions(-)
delete mode 100644 lib/ingestors/concerns/ical_ingestor_export_url.rb
diff --git a/lib/ingestors/concerns/ical_ingestor_export_url.rb b/lib/ingestors/concerns/ical_ingestor_export_url.rb
deleted file mode 100644
index 65968fa55..000000000
--- a/lib/ingestors/concerns/ical_ingestor_export_url.rb
+++ /dev/null
@@ -1,49 +0,0 @@
-# frozen_string_literal: true
-
-module Ingestors
- module Concerns
- # Gets the proper URL to export an ics or ical
- module IcalIngestorExportUrl
- private
-
- # 1. If the host includes 'indico', ensures the path ends with '/events.ics'.
- # 2. If the path already ends with '/events.ics', return as-is.
- # 3. Otherwise, append '?ical=true' query param if not already present.
- #
- # This method never mutates the original URL string.
- # Returns the updated URL string or nil if input is blank.
- def to_export(url)
- return nil if url.blank?
-
- uri = URI.parse(url)
- path = uri.path.to_s
-
- if uri.host&.include?('indico')
- ensure_events_ics_path(uri)
- elsif path.match?(%r{/(event|events)\.ics\z})
- uri.to_s
- else
- ensure_ical_query(uri)
- end
- end
-
- # Ensures the Indico URL ends with '/events.ics'
- def ensure_events_ics_path(uri)
- if uri.path&.include?('event')
- uri.path = File.join(uri.path, 'event.ics') unless uri.path.end_with?('/event.ics')
- elsif uri.path&.include?('category')
- uri.path = File.join(uri.path, 'events.ics') unless uri.path.end_with?('/events.ics')
- end
- uri.to_s
- end
-
- # Ensures the URL has '?ical=true' in its query params
- def ensure_ical_query(uri)
- query = URI.decode_www_form(uri.query.to_s).to_h
- query['ical'] = 'true' unless query['ical'] == 'true'
- uri.query = URI.encode_www_form(query)
- uri.to_s
- end
- end
- end
-end
diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb
index 0621b94f1..bb2b36ce4 100644
--- a/lib/ingestors/ical_ingestor.rb
+++ b/lib/ingestors/ical_ingestor.rb
@@ -9,7 +9,6 @@ module Ingestors
# Reads from direct ical / .ics / Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps.
class IcalIngestor < Ingestor
include Ingestors::Concerns::SitemapHelpers
- include Ingestors::Concerns::IcalIngestorExportUrl
def self.config
{
@@ -37,6 +36,8 @@ def read(source_url)
def process_url(url)
export_url = to_export(url)
events = Icalendar::Event.parse(open_url(export_url, raise: true).set_encoding('utf-8'))
+ raise 'Not found' if events.nil? || events.empty?
+
events.each do |e|
process_calevent(e)
end
@@ -44,6 +45,47 @@ def process_url(url)
@messages << "Process file url[#{export_url}] failed with: #{e.message}"
end
+ # 1. If the path already ends with '/events.ics', return as-is.
+ # 2. If the host includes 'indico', ensures the path ends with '/events.ics'.
+ # 3. Otherwise, append '?ical=true' query param if not already present.
+ #
+ # This method never mutates the original URL string.
+ # Returns the updated URL string or nil if input is blank.
+ def to_export(url)
+ return nil if url.blank?
+
+ uri = URI.parse(url)
+ path = uri.path.to_s
+
+ if path.match?(%r{/(event|events)\.ics\z})
+ uri.to_s
+ elsif uri.host&.include?('indico')
+ ensure_events_ics_path(uri)
+ else
+ ensure_ical_query(uri)
+ end
+ end
+
+ # Ensures the Indico URL ends with '/events.ics'
+ def ensure_events_ics_path(uri)
+ paths = uri.path.split('/')
+ uri.path = "#{paths[0..2].join('/')}/"
+ if paths[1] == 'event'
+ uri.path = File.join(uri.path, 'event.ics')
+ elsif paths[1] == 'category'
+ uri.path = File.join(uri.path, 'events.ics')
+ end
+ uri.to_s
+ end
+
+ # Ensures the URL has '?ical=true' in its query params
+ def ensure_ical_query(uri)
+ query = URI.decode_www_form(uri.query.to_s).to_h
+ query['ical'] = 'true' unless query['ical'] == 'true'
+ uri.query = URI.encode_www_form(query)
+ uri.to_s
+ end
+
# Builds the OpenStruct event and adds it in event.
def process_calevent(calevent)
event_to_add = OpenStruct.new.tap do |event|
@@ -60,8 +102,9 @@ def process_calevent(calevent)
def assign_basic_info(event, calevent)
event.url = calevent.url.to_s
event.title = calevent.summary.to_s
- event.description = process_description calevent.description
+ event.description = calevent.description.to_s
event.keywords = process_keywords(calevent.categories)
+ event.contact = calevent.contact.join(', ')
end
# Assigns to event: start, end, timezone.
@@ -73,21 +116,13 @@ def assign_time_info(event, calevent)
# Assigns to event: venue, online, city.
def assign_location_info(event, location)
- return if location.blank? || !location.present?
+ return if location.blank?
event.venue = location.to_s
event.online = location.downcase.include?('online')
event.city, event.postcode, event.country = process_location(location)
end
- # Removes all `<br />` tags and converts HTML to MD.
- def process_description(input)
- return input if input.nil?
-
- desc = input.to_s.gsub('', '
')
- convert_description(desc)
- end
-
# Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field.
# Handles whether tzid shows up as an Array or a single string
def get_tzid(dtstart)
@@ -102,17 +137,9 @@ def get_tzid(dtstart)
# Returns an array of 3 location characteristics: suburb, postcode, country
# Everything is nil if location.blank or location is online
def process_location(location)
- return [nil, nil, nil] if location.blank?
+ return [location['suburb'], location['postcode'], location['country']] if location.is_a?(Array)
- if location.to_s.downcase.include?('online')
- [nil, nil, nil]
- else
- [
- location['suburb'],
- location['postcode'],
- location['country']
- ]
- end
+ [nil, nil, nil]
end
# Returns keywords from the `CATEGORIES` ICal field
diff --git a/test/unit/ingestors/ical_ingestor_test.rb b/test/unit/ingestors/ical_ingestor_test.rb
index 8a8aaae4c..17ebc4cf0 100644
--- a/test/unit/ingestors/ical_ingestor_test.rb
+++ b/test/unit/ingestors/ical_ingestor_test.rb
@@ -192,12 +192,10 @@ class IcalIngestorTest < ActiveSupport::TestCase
calevent = Object.new # fake calevent
# Stub a method that will raise an error
- def ingestor.assign_basic_info(*)
- raise StandardError, 'test failure'
+ ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do
+ ingestor.send(:process_calevent, calevent)
end
- ingestor.send(:process_calevent, calevent)
-
assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure'
end
From f3f16b97e0fb14890e94aebc5c6e53c3d9046727 Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Mon, 27 Oct 2025 15:55:51 +0100
Subject: [PATCH 03/10] feat(indico-ingestor): API token auth
---
config/secrets.example.yml | 1 +
lib/ingestors/ical_ingestor.rb | 4 +++-
lib/ingestors/ingestor.rb | 3 ++-
3 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/config/secrets.example.yml b/config/secrets.example.yml
index 018b0439e..718f5ad0f 100644
--- a/config/secrets.example.yml
+++ b/config/secrets.example.yml
@@ -38,6 +38,7 @@ external_api_keys: &external_api_keys
password:
gpt_api_key:
willma_api_key:
+ indico_api_token: # begins with 'indp_', cf. https://docs.getindico.io/en/stable/http-api/access/#api-token-authentication
#Internal config
development:
diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb
index bb2b36ce4..647af15c1 100644
--- a/lib/ingestors/ical_ingestor.rb
+++ b/lib/ingestors/ical_ingestor.rb
@@ -19,6 +19,7 @@ def self.config
end
def read(source_url)
+ @token << Rails.application.config.secrets.indico_api_token
@verbose = false
sources = get_sources(source_url)
return if sources.nil?
@@ -35,7 +36,8 @@ def read(source_url)
# Note: One .ics file can have multiple Ical events.
def process_url(url)
export_url = to_export(url)
- events = Icalendar::Event.parse(open_url(export_url, raise: true).set_encoding('utf-8'))
+ content = open_url(export_url, token: @token, raise: true).set_encoding('utf-8')
+ events = Icalendar::Event.parse(content)
raise 'Not found' if events.nil? || events.empty?
events.each do |e|
diff --git a/lib/ingestors/ingestor.rb b/lib/ingestors/ingestor.rb
index 8f043f135..e552f16fe 100644
--- a/lib/ingestors/ingestor.rb
+++ b/lib/ingestors/ingestor.rb
@@ -50,13 +50,14 @@ def stats_summary(type)
summary
end
- def open_url(url, raise: false)
+ def open_url(url, token: '', raise: false)
options = {
redirect: false, # We're doing redirects manually below, since open-uri can't handle http -> https redirection
read_timeout: 5
}
options[:ssl_verify_mode] = config[:ssl_verify_mode] if config.key?(:ssl_verify_mode)
redirect_attempts = 5
+ options['Authorization'] = "Bearer #{token}" if token
begin
URI(url).open(options)
rescue OpenURI::HTTPRedirect => e
From 529009ba830d2a4272bac70ae9677494c70e6565 Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Mon, 27 Oct 2025 16:33:37 +0100
Subject: [PATCH 04/10] fixtests(indico-ingestor): typo
---
lib/ingestors/ical_ingestor.rb | 2 +-
lib/ingestors/ingestor.rb | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb
index 647af15c1..09fdb8ae6 100644
--- a/lib/ingestors/ical_ingestor.rb
+++ b/lib/ingestors/ical_ingestor.rb
@@ -19,7 +19,7 @@ def self.config
end
def read(source_url)
- @token << Rails.application.config.secrets.indico_api_token
+ @token = Rails.application.config.secrets.indico_api_token
@verbose = false
sources = get_sources(source_url)
return if sources.nil?
diff --git a/lib/ingestors/ingestor.rb b/lib/ingestors/ingestor.rb
index e552f16fe..a662985cf 100644
--- a/lib/ingestors/ingestor.rb
+++ b/lib/ingestors/ingestor.rb
@@ -50,14 +50,14 @@ def stats_summary(type)
summary
end
- def open_url(url, token: '', raise: false)
+ def open_url(url, raise: false, token: nil)
options = {
redirect: false, # We're doing redirects manually below, since open-uri can't handle http -> https redirection
read_timeout: 5
}
options[:ssl_verify_mode] = config[:ssl_verify_mode] if config.key?(:ssl_verify_mode)
redirect_attempts = 5
- options['Authorization'] = "Bearer #{token}" if token
+ options['Authorization'] = "Bearer #{token}" unless token.nil?
begin
URI(url).open(options)
rescue OpenURI::HTTPRedirect => e
From 5b9ab3f676d848e07cb636c76de5c2d2508812a0 Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Fri, 21 Nov 2025 09:52:41 +0100
Subject: [PATCH 05/10] revert(ical_ingestor): to before the indico changes
---
lib/ingestors/ical_ingestor.rb | 212 ++++++++++------------
test/unit/ingestors/ical_ingestor_test.rb | 52 +-----
2 files changed, 95 insertions(+), 169 deletions(-)
diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb
index 09fdb8ae6..e91fdecfa 100644
--- a/lib/ingestors/ical_ingestor.rb
+++ b/lib/ingestors/ical_ingestor.rb
@@ -1,154 +1,130 @@
-# frozen_string_literal: true
-
require 'icalendar'
require 'nokogiri'
require 'open-uri'
require 'tzinfo'
module Ingestors
- # Reads from direct ical / .ics / Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps.
class IcalIngestor < Ingestor
- include Ingestors::Concerns::SitemapHelpers
-
def self.config
{
key: 'ical',
- title: 'iCalendar / Indico / .ics File',
+ title: 'iCalendar',
category: :events
}
end
- def read(source_url)
- @token = Rails.application.config.secrets.indico_api_token
- @verbose = false
- sources = get_sources(source_url)
- return if sources.nil?
-
- sources.each do |url|
- process_url(url)
+ def read(url)
+ unless url.nil?
+ if url.to_s.downcase.end_with? 'sitemap.xml'
+ process_sitemap url
+ else
+ process_icalendar url
+ end
end
end
private
- # Modifies the given URL to the ics or ical export.
- # Loops into each Ical event to process it.
- # Note: One .ics file can have multiple Ical events.
- def process_url(url)
- export_url = to_export(url)
- content = open_url(export_url, token: @token, raise: true).set_encoding('utf-8')
- events = Icalendar::Event.parse(content)
- raise 'Not found' if events.nil? || events.empty?
-
- events.each do |e|
- process_calevent(e)
- end
- rescue StandardError => e
- @messages << "Process file url[#{export_url}] failed with: #{e.message}"
- end
-
- # 1. If the path already ends with '/events.ics', return as-is.
- # 2. If the host includes 'indico', ensures the path ends with '/events.ics'.
- # 3. Otherwise, append '?ical=true' query param if not already present.
- #
- # This method never mutates the original URL string.
- # Returns the updated URL string or nil if input is blank.
- def to_export(url)
- return nil if url.blank?
-
- uri = URI.parse(url)
- path = uri.path.to_s
-
- if path.match?(%r{/(event|events)\.ics\z})
- uri.to_s
- elsif uri.host&.include?('indico')
- ensure_events_ics_path(uri)
- else
- ensure_ical_query(uri)
- end
- end
-
- # Ensures the Indico URL ends with '/events.ics'
- def ensure_events_ics_path(uri)
- paths = uri.path.split('/')
- uri.path = "#{paths[0..2].join('/')}/"
- if paths[1] == 'event'
- uri.path = File.join(uri.path, 'event.ics')
- elsif paths[1] == 'category'
- uri.path = File.join(uri.path, 'events.ics')
- end
- uri.to_s
- end
-
- # Ensures the URL has '?ical=true' in its query params
- def ensure_ical_query(uri)
- query = URI.decode_www_form(uri.query.to_s).to_h
- query['ical'] = 'true' unless query['ical'] == 'true'
- uri.query = URI.encode_www_form(query)
- uri.to_s
- end
-
- # Builds the OpenStruct event and adds it in event.
- def process_calevent(calevent)
- event_to_add = OpenStruct.new.tap do |event|
- assign_basic_info(event, calevent)
- assign_time_info(event, calevent)
- assign_location_info(event, calevent.location)
+ def process_sitemap(url)
+ # find urls for individual icalendar files
+ begin
+ sitemap = Nokogiri::XML.parse(open_url(url, raise: true))
+ locs = sitemap.xpath('/ns:urlset/ns:url/ns:loc', {
+ 'ns' => 'http://www.sitemaps.org/schemas/sitemap/0.9'
+ })
+ locs.each do |loc|
+ process_icalendar(loc.text)
+ end
+ rescue Exception => e
+ @messages << "Extract from sitemap[#{url}] failed with: #{e.message}"
end
- add_event(event_to_add)
- rescue StandardError => e
- @messages << "Process iCalendar failed with: #{e.message}"
- end
- # Assigns to event: url, title, description, keywords.
- def assign_basic_info(event, calevent)
- event.url = calevent.url.to_s
- event.title = calevent.summary.to_s
- event.description = calevent.description.to_s
- event.keywords = process_keywords(calevent.categories)
- event.contact = calevent.contact.join(', ')
+ # finished
+ nil
end
- # Assigns to event: start, end, timezone.
- def assign_time_info(event, calevent)
- event.start = calevent.dtstart&.to_time unless calevent.dtstart.nil?
- event.end = calevent.dtend&.to_time unless calevent.dtend.nil?
- event.timezone = get_tzid(calevent.dtstart)
- end
-
- # Assigns to event: venue, online, city.
- def assign_location_info(event, location)
- return if location.blank?
+ def process_icalendar(url)
+ # process individual ics file
+ query = '?ical=true'
- event.venue = location.to_s
- event.online = location.downcase.include?('online')
- event.city, event.postcode, event.country = process_location(location)
- end
+ begin
+ # append query (if required)
+ file_url = url
+ file_url << query unless url.to_s.downcase.ends_with? query
- # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field.
- # Handles whether tzid shows up as an Array or a single string
- def get_tzid(dtstart)
- return nil unless dtstart.respond_to?(:ical_params)
+ # process file
+ events = Icalendar::Event.parse(open_url(file_url, raise: true).set_encoding('utf-8'))
- tzid = dtstart.ical_params['tzid']
- return nil if tzid.nil?
+ # process each event
+ events.each do |e|
+ process_event(e)
+ end
+ rescue Exception => e
+ @messages << "Process file url[#{file_url}] failed with: #{e.message}"
+ end
- tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s
+ # finished
+ nil
end
- # Returns an array of 3 location characteristics: suburb, postcode, country
- # Everything is nil if location.blank or location is online
- def process_location(location)
- return [location['suburb'], location['postcode'], location['country']] if location.is_a?(Array)
+ def process_event(calevent)
+ # puts "calevent: #{calevent.inspect}"
+ begin
+ # set fields
+ event = OpenStruct.new
+ event.url = calevent.url.to_s
+ event.title = calevent.summary.to_s
+ event.description = process_description calevent.description
+
+ # puts "\n\ncalevent.description = #{calevent.description}"
+ # puts "\n\n... converted = #{event.description}"
+
+ event.end = calevent.dtend&.to_time
+ unless calevent.dtstart.nil?
+ dtstart = calevent.dtstart
+ event.start = dtstart&.to_time
+ tzid = dtstart.ical_params['tzid']
+ event.timezone = tzid.first.to_s if !tzid.nil? and tzid.size > 0
+ end
+
+ event.venue = calevent.location.to_s
+ if calevent.location.downcase.include?('online')
+ event.online = true
+ event.city = nil
+ event.postcode = nil
+ event.country = nil
+ else
+ location = convert_location(calevent.location)
+ event.city = location['suburb'] unless location['suburb'].nil?
+ event.country = location['country'] unless location['country'].nil?
+ event.postcode = location['postcode'] unless location['postcode'].nil?
+ end
+ event.keywords = []
+ unless calevent.categories.nil? or calevent.categories.first.nil?
+ cats = calevent.categories.first
+ if cats.is_a?(Icalendar::Values::Array)
+ cats.each do |item|
+ event.keywords << item.to_s.lstrip
+ end
+ else
+ event.keywords << cats.to_s.strip
+ end
+ end
+
+ # store event
+ @events << event
+ rescue Exception => e
+ @messages << "Process iCalendar failed with: #{e.message}"
+ end
- [nil, nil, nil]
+ # finished
+ nil
end
- # Returns keywords from the `CATEGORIES` ICal field
- def process_keywords(categories)
- return [] if categories.blank?
+ def process_description(input)
+ return input if input.nil?
- categories.flatten.compact.map { |cat| cat.to_s.strip }
+ convert_description(input.to_s.gsub(/\R/, '<br />'))
end
end
end
diff --git a/test/unit/ingestors/ical_ingestor_test.rb b/test/unit/ingestors/ical_ingestor_test.rb
index 17ebc4cf0..cb6ff5a54 100644
--- a/test/unit/ingestors/ical_ingestor_test.rb
+++ b/test/unit/ingestors/ical_ingestor_test.rb
@@ -26,7 +26,7 @@ class IcalIngestorTest < ActiveSupport::TestCase
assert ingestor.events.empty?
assert ingestor.materials.empty?
- assert_includes ingestor.messages[0], 'Extract from sitemap[https://missing.org/sitemap.xml] failed with:'
+ assert_includes ingestor.messages, 'Extract from sitemap[https://missing.org/sitemap.xml] failed with: 404 '
end
test 'ingest valid sitemap' do
@@ -187,56 +187,6 @@ class IcalIngestorTest < ActiveSupport::TestCase
end
end
- test 'process_calevent logs error when exception is raised' do
- ingestor = Ingestors::IcalIngestor.new
- calevent = Object.new # fake calevent
-
- # Stub a method that will raise an error
- ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do
- ingestor.send(:process_calevent, calevent)
- end
-
- assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure'
- end
-
- test 'to_export method' do
- ingestor = Ingestors::IcalIngestor.new
- indico_url_event = 'https://indico.cern.ch/event/1588342/'
- indico_url_event_with_ics = 'https://indico.cern.ch/event/1588342/event.ics' # ! when '/event', event.ics is singular
- indico_url_event_with_query = 'https://indico.cern.ch/event/1588342/?somerandom=urlparams&an=otherone'
- indico_url_event_with_query_with_ics = 'https://indico.cern.ch/event/1588342/event.ics?somerandom=urlparams&an=otherone'
- indico_url_category = 'https://indico.cern.ch/category/19377/'
- indico_url_category_with_ics = 'https://indico.cern.ch/category/19377/events.ics' # ! when '/category', eventS.ics is plural
- indico_url_category_with_query = 'https://indico.cern.ch/category/19377/?a=b&c=d'
- indico_url_category_with_query_with_ics = 'https://indico.cern.ch/category/19377/events.ics?a=b&c=d'
- url_with_ics = 'https://mywebsite.com/event/blabla/events.ics'
- url_with_query_with_ics = 'https://mywebsite.com/event/blabla/events.ics?john=doe&isstub=born'
- url_no_ical = 'https://mywebsite.com/event/blabla'
- url_with_ical = 'https://mywebsite.com/event/blabla?ical=true'
-
- # When indico link – event
- assert_equal ingestor.send(:to_export, indico_url_event), indico_url_event_with_ics # adds ics
- assert_equal ingestor.send(:to_export, indico_url_event_with_query), indico_url_event_with_query_with_ics # adds ics
-
- # When indico link – category
- assert_equal ingestor.send(:to_export, indico_url_category), indico_url_category_with_ics # adds ics
- assert_equal ingestor.send(:to_export, indico_url_category_with_query), indico_url_category_with_query_with_ics # adds ics
-
- # When non-indico link
- assert_equal ingestor.send(:to_export, url_with_ics), url_with_ics # keeps same
- assert_equal ingestor.send(:to_export, url_with_query_with_ics), url_with_query_with_ics # keeps same
-
- # When indico link which already has the /events.ics
- assert_equal ingestor.send(:to_export, indico_url_event_with_ics), indico_url_event_with_ics # keeps it as-is
- assert_equal ingestor.send(:to_export, indico_url_event_with_query_with_ics), indico_url_event_with_query_with_ics # keeps it as-is
-
- # When other url, adds the ical query param
- assert_equal ingestor.send(:to_export, url_no_ical), url_with_ical
-
- # When other url with ical query param, keep it as-is
- assert_equal ingestor.send(:to_export, url_with_ical), url_with_ical
- end
-
private
def check_event_exists(title, url)
From af243e3e493e365e1ddae4da74e4f516dd29fa08 Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Tue, 2 Dec 2025 11:54:00 +0100
Subject: [PATCH 06/10] review(#1161): Indico ingestor as a separate class
---
lib/ingestors/indico_ingestor.rb | 167 +++++++++++++
lib/ingestors/ingestor_factory.rb | 1 +
test/unit/ingestors/indico_ingestor_test.rb | 248 ++++++++++++++++++++
3 files changed, 416 insertions(+)
create mode 100644 lib/ingestors/indico_ingestor.rb
create mode 100644 test/unit/ingestors/indico_ingestor_test.rb
diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb
new file mode 100644
index 000000000..2ee1cd1a3
--- /dev/null
+++ b/lib/ingestors/indico_ingestor.rb
@@ -0,0 +1,167 @@
+# frozen_string_literal: true
+
+require 'icalendar'
+require 'nokogiri'
+require 'open-uri'
+require 'tzinfo'
+
+module Ingestors
+ # Reads from direct .ics or Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps.
+ class IndicoIngestor < Ingestor
+ include Ingestors::Concerns::SitemapHelpers
+
+ def self.config
+ {
+ key: 'indico',
+ title: 'Indico / .ics file',
+ category: :events
+ }
+ end
+
+ def read(source_url)
+ @token = Rails.application.config.secrets.indico_api_token
+ @verbose = false
+ sources = get_sources(source_url)
+ return if sources.nil?
+
+ sources.each do |url|
+ process_url(url)
+ end
+ end
+
+ private
+
+ # Modifies the given URL to the ics export.
+ # Loops into each Ical event to process it.
+ # Note: One .ics file can have multiple Ical events.
+ def process_url(url)
+ export_url = to_export(url)
+ raise 'Not an indico link' if export_url.nil?
+
+ content = open_url(export_url, token: @token, raise: true).set_encoding('utf-8')
+ events = Icalendar::Event.parse(content)
+ raise 'Not found' if events.nil? || events.empty?
+
+ events.each do |e|
+ process_calevent(e)
+ end
+ rescue StandardError => e
+ @messages << "Process file url[#{export_url}] failed with: #{e.message}"
+ end
+
+ # 1. If the path already ends with '/events.ics', return as-is.
+ # 2. If the host includes 'indico', ensures the path ends with '/events.ics'.
+ # 3. Otherwise, append '?ical=true' query param if not already present.
+ #
+ # This method never mutates the original URL string.
+ # Returns the updated URL string or nil if input is blank.
+ def to_export(url)
+ return nil if url.blank?
+
+ uri = URI.parse(url)
+ path = uri.path.to_s
+
+ if path.match?(%r{/(event|events)\.ics\z})
+ uri.to_s
+ elsif indico_page?(uri)
+ ensure_events_ics_path(uri)
+ else
+ nil
+ end
+ end
+
+ def indico_page?(uri)
+ # Either checks in host, e.g., 'indico.myinstitution.com'
+ return true if uri.host&.include?('indico')
+
+ # Or checks in meta tags
+ html = open_url(uri, raise: true)
+ doc = Nokogiri::HTML(html)
+ content = doc.at('meta[property="og:site_name"]')&.[]('content')
+ content&.match?(/indico/i)
+ end
+
+ # Ensures the Indico URL ends with '/events.ics'
+ def ensure_events_ics_path(uri)
+ paths = uri.path.split('/')
+ uri.path = "#{paths[0..2].join('/')}/"
+ if paths[1] == 'event'
+ uri.path = File.join(uri.path, 'event.ics')
+ elsif paths[1] == 'category'
+ uri.path = File.join(uri.path, 'events.ics')
+ end
+ uri.to_s
+ end
+
+ # Ensures the URL has '?ical=true' in its query params
+ def ensure_ical_query(uri)
+ query = URI.decode_www_form(uri.query.to_s).to_h
+ query['ical'] = 'true' unless query['ical'] == 'true'
+ uri.query = URI.encode_www_form(query)
+ uri.to_s
+ end
+
+ # Builds the OpenStruct event and adds it in event.
+ def process_calevent(calevent)
+ event_to_add = OpenStruct.new.tap do |event|
+ assign_basic_info(event, calevent)
+ assign_time_info(event, calevent)
+ assign_location_info(event, calevent.location)
+ end
+ add_event(event_to_add)
+ rescue StandardError => e
+ @messages << "Process iCalendar failed with: #{e.message}"
+ end
+
+ # Assigns to event: url, title, description, keywords.
+ def assign_basic_info(event, calevent)
+ event.url = calevent.url.to_s
+ event.title = calevent.summary.to_s
+ event.description = calevent.description.to_s
+ event.keywords = process_keywords(calevent.categories)
+ event.contact = calevent.contact.join(', ')
+ end
+
+ # Assigns to event: start, end, timezone.
+ def assign_time_info(event, calevent)
+ event.start = calevent.dtstart&.to_time unless calevent.dtstart.nil?
+ event.end = calevent.dtend&.to_time unless calevent.dtend.nil?
+ event.timezone = get_tzid(calevent.dtstart)
+ end
+
+ # Assigns to event: venue, online, city.
+ def assign_location_info(event, location)
+ return if location.blank?
+
+ event.venue = location.to_s
+ event.online = location.downcase.include?('online')
+ event.city, event.postcode, event.country = process_location(location)
+ end
+
+ # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field.
+ # Handles whether tzid shows up as an Array or a single string
+ def get_tzid(dtstart)
+ return nil unless dtstart.respond_to?(:ical_params)
+
+ tzid = dtstart.ical_params['tzid']
+ return nil if tzid.nil?
+
+ tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s
+ end
+
+ # Returns an array of 3 location characteristics: suburb, postcode, country
+ # Everything is nil if location.blank or location is online
+ def process_location(location)
+ return [location['suburb'], location['postcode'], location['country']] if location.is_a?(Array)
+
+ [nil, nil, nil]
+ end
+
+ # Returns keywords from the `CATEGORIES` ICal field
+ def process_keywords(categories)
+ return [] if categories.blank?
+
+ categories.flatten.compact.map { |cat| cat.to_s.strip }
+ end
+ end
+end
diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb
index 67e818d02..b2cafda9f 100644
--- a/lib/ingestors/ingestor_factory.rb
+++ b/lib/ingestors/ingestor_factory.rb
@@ -6,6 +6,7 @@ def self.ingestors
Ingestors::EventbriteIngestor,
Ingestors::EventCsvIngestor,
Ingestors::IcalIngestor,
+ Ingestors::IndicoIngestor,
Ingestors::LibcalIngestor,
Ingestors::MaterialCsvIngestor,
Ingestors::TessEventIngestor,
diff --git a/test/unit/ingestors/indico_ingestor_test.rb b/test/unit/ingestors/indico_ingestor_test.rb
new file mode 100644
index 000000000..eaf19191a
--- /dev/null
+++ b/test/unit/ingestors/indico_ingestor_test.rb
@@ -0,0 +1,248 @@
+require 'test_helper'
+
+class IndicoIngestorTest < ActiveSupport::TestCase
+ setup do
+ @user = users(:regular_user)
+ @content_provider = content_providers(:another_portal_provider)
+ mock_ingestions
+ # mock_nominatim
+ mock_timezone # System time zone should not affect test result
+ end
+
+ teardown do
+ reset_timezone
+ end
+
+ test 'sitemap not found' do
+ source = @content_provider.sources.build(url: 'https://missing.org/sitemap.xml',
+ method: 'ical',
+ enabled: true)
+ ingestor = Ingestors::IcalIngestor.new
+
+ assert_no_difference('Event.count') do
+ ingestor.read(source.url)
+ ingestor.write(@user, @content_provider)
+ end
+
+ assert ingestor.events.empty?
+ assert ingestor.materials.empty?
+ assert_includes ingestor.messages[0], 'Extract from sitemap[https://missing.org/sitemap.xml] failed with:'
+ end
+
+ test 'ingest valid sitemap' do
+ source = @content_provider.sources.build(url: 'https://app.com/events/sitemap.xml',
+ method: 'ical',
+ enabled: true)
+ ingestor = Ingestors::IcalIngestor.new
+
+ # check two events to be updated
+ name = 'ical_event_1'
+ event = events(:ical_event_1)
+ refute event.nil?, "event[#{name}] not found"
+ refute event.online?, "event[#{name}] online not matched"
+ assert_equal 'Another Portal Provider', event.content_provider.title,
+ "event[#{name}] content provider not matched"
+
+ name = 'ical_event_2'
+ refute events(name).nil?, "fixture[#{name}] not found"
+ title = 'PaCER Seminar: Computational Fluid Dynamics'
+ url = 'https://pawsey.org.au/event/pacer-seminar-computational-fluid-dynamics/'
+ event = check_event_exists title, url
+ refute event.nil?, "event title[#{title}] not found"
+ refute event.online?, "event title[#{title}] online not matched"
+ assert_equal 'Another Portal Provider', event.content_provider.title,
+ "event title[#{title}] content provider not matched"
+
+ assert_difference('Event.count', 4) do
+ freeze_time(2019) do
+ ingestor.read(source.url)
+ ingestor.write(@user, @content_provider)
+ end
+ end
+
+ assert_equal 8, ingestor.events.count
+ assert ingestor.materials.empty?
+ assert_equal 4, ingestor.stats[:events][:added]
+ assert_equal 2, ingestor.stats[:events][:updated]
+ assert_equal 2, ingestor.stats[:events][:rejected]
+
+ # check individual events
+ # check not found
+ assert_includes ingestor.messages, "Process file url\[https://pawsey.org.au/events/\?ical=true\] failed with: 404 "
+
+ # check rejected
+ event = ingestor.events.detect { |e| e.title == 'NVIDIA cuQuantum Session' }
+ assert event
+ assert event.errors.added?(:url, :url, value: '123')
+ event = ingestor.events.detect { |e| e.title == 'PaCER Seminar: Radio astronomy' }
+ assert event
+ assert event.errors.added?(:url, :blank)
+
+ # check added
+ title = 'Ask Me Anything: Porous media visualisation and LBPM'
+ event = check_event_exists title, 'https://pawsey.org.au/event/ask-me-anything-porous-media-visualisation-and-lbpm/'
+ assert event.online?, "event title[#{event.title}] online not matched"
+ assert (!event.keywords.nil? and event.keywords.size == 2), "event title[#{event.title}] keywords.size not matched"
+ assert event.keywords.include?('AMA'), "event title[#{event.title}] keyword[AMA] not found"
+ assert event.keywords.include?('Visualisation'), "event title[#{event.title}] keyword[Visualisation] not found"
+
+ title = 'Pawsey Intern Showcase 2022'
+ event = check_event_exists title, 'https://pawsey.org.au/event/pawsey-intern-showcase-2022/'
+ assert_includes event.description, 'The Pawsey Supercomputing Research Centre takes prides in its Summer Internship Program'
+ assert_includes event.description, 'range of trainings we immerse students in during Week 1 of the Program (and throughout).'
+ assert_equal 'Perth', event.timezone.to_s, "event title[#{event.title}] timezone not matched"
+ assert_equal '2022-02-11 01:45:00 UTC', event.start.utc.to_s, "event title[#{event.title}] start not matched"
+ assert_equal '2022-02-11 04:50:00 UTC', event.end.utc.to_s, "event title[#{event.title}] end not matched"
+
+ title = 'P\'Con - Experience with porting and scaling codes on AMD GPUs'
+ event = check_event_exists title, 'https://pawsey.org.au/event/experience-with-porting-and-scaling-codes-on-amd-gpus/'
+ assert event.online?, "event title[#{title}] online not matched"
+
+ title = 'Overview of High Performance Computing Resources at OLCF'
+ event = check_event_exists title, 'https://pawsey.org.au/event/overview-of-high-performance-computing-resources-at-olcf/'
+ refute event.online?, "event title[#{title}] online not matched"
+ location = 'Pawsey Supercomputing Centre, 1 Bryce Avenue, Kensington, Western Australia, 6151, Australia'
+ assert_equal location, event.venue, "event title[#{title}] venue not matched"
+ # Geocoding is disabled so these fail TODO: Re-enable, but using cache + rate limiting
+ # assert_equal 'Kensington', event.city, "event title[#{title}] city not matched"
+ # assert_equal '6151', event.postcode, "event title[#{title}] postcode not matched"
+ # assert_equal 'Australia', event.country, "event title[#{title}] country not matched"
+
+ # TODO: check updated
+ title = 'PaCER Seminar: Computational Fluid Dynamics'
+ event = check_event_exists title, 'https://pawsey.org.au/event/pacer-seminar-computational-fluid-dynamics/'
+ assert_equal '2022-06-15 03:00:00 UTC', event.end.utc.to_s, "event title[#{event.title}] updated end not matched"
+ assert event.description != 'MyText', "event title[#{event.title}] description not updated"
+ assert event.description.size > 100, "event title[#{event.title}] description too short"
+ assert event.online?, "event title[#{event.title}] online not matched"
+ assert_equal 2, event.keywords.size, "event title[#{event.title}] keywords size not matched"
+ %w[Supercomputing Seminar].each do |keyword|
+ assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found"
+ end
+ assert_equal 'Online, Virtual, Australia', event.venue, "event title[#{event.title}] venue not matched"
+ assert event.city.nil?, "event title[#{event.title}] city not matched"
+ assert event.postcode.nil?, "event title[#{event.title}] postcode not matched"
+ assert event.country.nil?, "event title[#{event.title}] country not matched"
+
+ title = "P'Con - Embracing new solutions for in-situ visualisation"
+ event = check_event_exists title, 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/'
+ assert event.online?, "event title[#{event.title}] online not matched"
+ assert_equal 3, event.keywords.size, "event title[#{event.title}] keywords size not matched"
+ %w[Supercomputing Conference Visualisation].each do |keyword|
+ assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found"
+ end
+ assert_equal 'Online, Virtual, Australia', event.venue, "event title[#{event.title}] venue not matched"
+ assert event.postcode.nil?, "event title[#{event.title}] postcode not matched"
+ assert event.city.nil?, "event title[#{event.title}] city not matched"
+ assert event.country.nil?, "event title[#{event.title}] country not matched"
+ end
+
+ test 'check single ical sources' do
+ # override time
+ assert_no_difference 'Event.count' do
+ freeze_time(2019) do
+ ingestor = Ingestors::IcalIngestor.new
+ source = @content_provider.sources.build(
+ url: 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/?ical=true',
+ method: 'ical', enabled: true
+ )
+
+ ingestor.read(source.url)
+ ingestor.write(@user, @content_provider)
+
+ assert_equal 1, ingestor.events.count
+ assert ingestor.materials.empty?
+ assert_equal 0, ingestor.stats[:events][:added]
+ assert_equal 1, ingestor.stats[:events][:updated]
+ assert_equal 0, ingestor.stats[:events][:rejected]
+
+ ingestor = Ingestors::IcalIngestor.new
+ source = @content_provider.sources.build(
+ url: 'https://pawsey.org.au/event/pawsey-intern-showcase-2021/?ical=true',
+ method: 'ical', enabled: true
+ )
+
+ ingestor.read(source.url)
+ ingestor.write(@user, @content_provider)
+
+ assert_equal 1, ingestor.events.count
+ assert ingestor.materials.empty?
+ assert_equal 0, ingestor.stats[:events][:added]
+ assert_equal 0, ingestor.stats[:events][:updated]
+ assert_equal 1, ingestor.stats[:events][:rejected]
+
+ event = ingestor.events.detect { |e| e.title == 'Pawsey Intern Showcase 2021' }
+ assert event
+ assert event.errors.added?(:url, :blank)
+ end
+ end
+
+ # get updated
+ title = 'P\'Con - Embracing new solutions for in-situ visualisation'
+ url = 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/'
+ event = check_event_exists title, url
+ assert_equal 3, event.keywords.size
+ %w[Supercomputing Conference Visualisation].each do |keyword|
+ assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found"
+ end
+ end
+
+ test 'process_calevent logs error when exception is raised' do
+ ingestor = Ingestors::IcalIngestor.new
+ calevent = Object.new # fake calevent
+
+ # Stub a method that will raise an error
+ ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do
+ ingestor.send(:process_calevent, calevent)
+ end
+
+ assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure'
+ end
+
+ test 'to_export method' do
+ ingestor = Ingestors::IcalIngestor.new
+ indico_url_event = 'https://indico.cern.ch/event/1588342/'
+ indico_url_event_with_ics = 'https://indico.cern.ch/event/1588342/event.ics' # ! when '/event', event.ics is singular
+ indico_url_event_with_query = 'https://indico.cern.ch/event/1588342/?somerandom=urlparams&an=otherone'
+ indico_url_event_with_query_with_ics = 'https://indico.cern.ch/event/1588342/event.ics?somerandom=urlparams&an=otherone'
+ indico_url_category = 'https://indico.cern.ch/category/19377/'
+ indico_url_category_with_ics = 'https://indico.cern.ch/category/19377/events.ics' # ! when '/category', eventS.ics is plural
+ indico_url_category_with_query = 'https://indico.cern.ch/category/19377/?a=b&c=d'
+ indico_url_category_with_query_with_ics = 'https://indico.cern.ch/category/19377/events.ics?a=b&c=d'
+ url_with_ics = 'https://mywebsite.com/event/blabla/events.ics'
+ url_with_query_with_ics = 'https://mywebsite.com/event/blabla/events.ics?john=doe&isstub=born'
+ url_no_ical = 'https://mywebsite.com/event/blabla'
+ url_with_ical = 'https://mywebsite.com/event/blabla?ical=true'
+
+ # When indico link – event
+ assert_equal ingestor.send(:to_export, indico_url_event), indico_url_event_with_ics # adds ics
+ assert_equal ingestor.send(:to_export, indico_url_event_with_query), indico_url_event_with_query_with_ics # adds ics
+
+ # When indico link – category
+ assert_equal ingestor.send(:to_export, indico_url_category), indico_url_category_with_ics # adds ics
+ assert_equal ingestor.send(:to_export, indico_url_category_with_query), indico_url_category_with_query_with_ics # adds ics
+
+ # When non-indico link
+ assert_equal ingestor.send(:to_export, url_with_ics), url_with_ics # keeps same
+ assert_equal ingestor.send(:to_export, url_with_query_with_ics), url_with_query_with_ics # keeps same
+
+ # When indico link which already has the /events.ics
+ assert_equal ingestor.send(:to_export, indico_url_event_with_ics), indico_url_event_with_ics # keeps it as-is
+ assert_equal ingestor.send(:to_export, indico_url_event_with_query_with_ics), indico_url_event_with_query_with_ics # keeps it as-is
+
+ # When other url, adds the ical query param
+ assert_equal ingestor.send(:to_export, url_no_ical), url_with_ical
+
+ # When other url with ical query param, keep it as-is
+ assert_equal ingestor.send(:to_export, url_with_ical), url_with_ical
+ end
+
+ private
+
+ def check_event_exists(title, url)
+ events = Event.where(title: title, url: url)
+ assert (!events.nil? and events.size > 0), "event title[#{title}] not found"
+ assert events.size < 2, "event[#{title}] duplicates found = #{events.size}"
+ events.first
+ end
+end
From 19fe8c24e2313ce0d87e34bd5600e8c9ef71a7fe Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Tue, 2 Dec 2025 12:09:26 +0100
Subject: [PATCH 07/10] chore(indico_ingestor): removed references to ical
---
lib/ingestors/indico_ingestor.rb | 20 ++++----------------
1 file changed, 4 insertions(+), 16 deletions(-)
diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb
index 2ee1cd1a3..0a4363689 100644
--- a/lib/ingestors/indico_ingestor.rb
+++ b/lib/ingestors/indico_ingestor.rb
@@ -32,13 +32,13 @@ def read(source_url)
private
# Modifies the given URL to the ics export.
- # Loops into each Ical event to process it.
- # Note: One .ics file can have multiple Ical events.
+ # Loops into each event to process it.
+ # Note: One .ics file can have multiple events.
def process_url(url)
export_url = to_export(url)
raise 'Not an indico link' if export_url.nil?
- content = open_url(export_url, token: @token, raise: true).set_encoding('utf-8')
+ content = open_url(export_url, raise: true, token: @token).set_encoding('utf-8')
events = Icalendar::Event.parse(content)
raise 'Not found' if events.nil? || events.empty?
@@ -51,8 +51,6 @@ def process_url(url)
# 1. If the path already ends with '/events.ics', return as-is.
# 2. If the host includes 'indico', ensures the path ends with '/events.ics'.
- # 3. Otherwise, append '?ical=true' query param if not already present.
- #
# This method never mutates the original URL string.
# Returns the updated URL string or nil if input is blank.
def to_export(url)
@@ -65,8 +63,6 @@ def to_export(url)
uri.to_s
elsif indico_page?(uri)
ensure_events_ics_path(uri)
- else
- nil
end
end
@@ -93,14 +89,6 @@ def ensure_events_ics_path(uri)
uri.to_s
end
- # Ensures the URL has '?ical=true' in its query params
- def ensure_ical_query(uri)
- query = URI.decode_www_form(uri.query.to_s).to_h
- query['ical'] = 'true' unless query['ical'] == 'true'
- uri.query = URI.encode_www_form(query)
- uri.to_s
- end
-
# Builds the OpenStruct event and adds it in event.
def process_calevent(calevent)
event_to_add = OpenStruct.new.tap do |event|
@@ -110,7 +98,7 @@ def process_calevent(calevent)
end
add_event(event_to_add)
rescue StandardError => e
- @messages << "Process iCalendar failed with: #{e.message}"
+ @messages << "process_calevent failed with: #{e.message}"
end
# Assigns to event: url, title, description, keywords.
From 5263990f3caa86f1d293f61a8d96c39a5e3072dc Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Thu, 4 Dec 2025 11:08:01 +0100
Subject: [PATCH 08/10] fix(indico_ingestor): presence, stderr
---
app/controllers/events_controller.rb | 2 +-
lib/ingestors/indico_ingestor.rb | 10 ++++++----
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb
index 86ec50080..67a2ac962 100644
--- a/app/controllers/events_controller.rb
+++ b/app/controllers/events_controller.rb
@@ -236,7 +236,7 @@ def event_params
:timezone, :content_provider_id, { collection_ids: [] }, { node_ids: [] },
{ node_names: [] }, { target_audience: [] }, { eligibility: [] }, :visible,
{ host_institutions: [] }, :capacity, :contact, :recognition, :learning_objectives,
- :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language,
+ :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, :presence,
external_resources_attributes: %i[id url title _destroy],
external_resources: %i[url title], material_ids: [],
llm_interaction_attributes: %i[id scrape_or_process model prompt input output needs_processing _destroy],
diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb
index 0a4363689..2e20a2ec1 100644
--- a/lib/ingestors/indico_ingestor.rb
+++ b/lib/ingestors/indico_ingestor.rb
@@ -94,11 +94,11 @@ def process_calevent(calevent)
event_to_add = OpenStruct.new.tap do |event|
assign_basic_info(event, calevent)
assign_time_info(event, calevent)
- assign_location_info(event, calevent.location)
+ assign_location_info(event, calevent)
end
add_event(event_to_add)
rescue StandardError => e
- @messages << "process_calevent failed with: #{e.message}"
+ Rails.logger.error("#{e.class}: #{e.message}")
end
# Assigns to event: url, title, description, keywords.
@@ -118,11 +118,13 @@ def assign_time_info(event, calevent)
end
# Assigns to event: venue, online, city.
- def assign_location_info(event, location)
+ def assign_location_info(event, calevent)
+ location = calevent.location
return if location.blank?
event.venue = location.to_s
- event.online = location.downcase.include?('online')
+ event.online = calevent.description.include?('zoom')
+ event.presence = calevent.description.include?('zoom') ? :hybrid : :onsite # can do best, but sufficient for now
event.city, event.postcode, event.country = process_location(location)
end
From ab659e0f95c8ab542a44e79475e2f3871f7e92d4 Mon Sep 17 00:00:00 2001
From: kennethrioja <59597207+kennethrioja@users.noreply.github.com>
Date: Mon, 8 Dec 2025 17:25:57 +0100
Subject: [PATCH 09/10] test(indico_ingestor): added tests
---
lib/ingestors/indico_ingestor.rb | 18 +-
.../fixtures/files/ingestion/indico/event.ics | 16 +
.../files/ingestion/indico/events.ics | 25 ++
.../files/ingestion/indico/indico.html | 16 +
test/unit/ingestors/indico_ingestor_test.rb | 276 +++++-------------
5 files changed, 131 insertions(+), 220 deletions(-)
create mode 100644 test/fixtures/files/ingestion/indico/event.ics
create mode 100644 test/fixtures/files/ingestion/indico/events.ics
create mode 100644 test/fixtures/files/ingestion/indico/indico.html
diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb
index 2e20a2ec1..f826fd744 100644
--- a/lib/ingestors/indico_ingestor.rb
+++ b/lib/ingestors/indico_ingestor.rb
@@ -106,7 +106,7 @@ def assign_basic_info(event, calevent)
event.url = calevent.url.to_s
event.title = calevent.summary.to_s
event.description = calevent.description.to_s
- event.keywords = process_keywords(calevent.categories)
+ event.keywords = calevent.categories.flatten
event.contact = calevent.contact.join(', ')
end
@@ -125,7 +125,6 @@ def assign_location_info(event, calevent)
event.venue = location.to_s
event.online = calevent.description.include?('zoom')
event.presence = calevent.description.include?('zoom') ? :hybrid : :onsite # can do best, but sufficient for now
- event.city, event.postcode, event.country = process_location(location)
end
# Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field.
@@ -138,20 +137,5 @@ def get_tzid(dtstart)
tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s
end
-
- # Returns an array of 3 location characteristics: suburb, postcode, country
- # Everything is nil if location.blank or location is online
- def process_location(location)
- return [location['suburb'], location['postcode'], location['country']] if location.is_a?(Array)
-
- [nil, nil, nil]
- end
-
- # Returns keywords from the `CATEGORIES` ICal field
- def process_keywords(categories)
- return [] if categories.blank?
-
- categories.flatten.compact.map { |cat| cat.to_s.strip }
- end
end
end
diff --git a/test/fixtures/files/ingestion/indico/event.ics b/test/fixtures/files/ingestion/indico/event.ics
new file mode 100644
index 000000000..58728eadf
--- /dev/null
+++ b/test/fixtures/files/ingestion/indico/event.ics
@@ -0,0 +1,16 @@
+BEGIN:VCALENDAR
+VERSION:2.0
+PRODID:-//CERN//INDICO//EN
+BEGIN:VEVENT
+SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials
+DTSTART:20260309T080000Z
+DTEND:20260313T161500Z
+DTSTAMP:20251204T105300Z
+UID:indico-event-1617123@indico.cern.ch
+CONTACT:name.surname@test.com
+DESCRIPTION:speakers and zoom here
+LOCATION:CERN
+URL:https://indico.cern.ch/event/1617123/
+CATEGORIES:TRAINING,EDUCATION
+END:VEVENT
+END:VCALENDAR
\ No newline at end of file
diff --git a/test/fixtures/files/ingestion/indico/events.ics b/test/fixtures/files/ingestion/indico/events.ics
new file mode 100644
index 000000000..7f3bbfd22
--- /dev/null
+++ b/test/fixtures/files/ingestion/indico/events.ics
@@ -0,0 +1,25 @@
+BEGIN:VCALENDAR
+VERSION:2.0
+PRODID:-//CERN//INDICO//EN
+BEGIN:VEVENT
+SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials
+DTSTART:20260309T080000Z
+DTEND:20260313T161500Z
+DTSTAMP:20251203T150800Z
+UID:indico-event-1617123@indico.cern.ch
+CONTACT:name.surname@test.com
+DESCRIPTION:speakers and zoom here
+LOCATION:CERN
+URL:https://indico.cern.ch/event/1617123/
+END:VEVENT
+BEGIN:VEVENT
+SUMMARY:HEP C++ Course and Hands-on Training - Stay Informed
+DTSTART:20991231T225800Z
+DTEND:20991231T225900Z
+DTSTAMP:20251203T150800Z
+UID:indico-event-1211412@indico.cern.ch
+CONTACT:name.surname@test.com
+DESCRIPTION:mockdescription
+URL:https://indico.cern.ch/event/1211412/
+END:VEVENT
+END:VCALENDAR
\ No newline at end of file
diff --git a/test/fixtures/files/ingestion/indico/indico.html b/test/fixtures/files/ingestion/indico/indico.html
new file mode 100644
index 000000000..33765bb38
--- /dev/null
+++ b/test/fixtures/files/ingestion/indico/indico.html
@@ -0,0 +1,16 @@
+
+
+
+