From 979e51eb63d2b86ee256e399eac6afe55ee7522d Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Tue, 14 Oct 2025 10:44:54 +0200 Subject: [PATCH 01/10] feat(indico-ingestor): extended ical ingestor to .ics and Indico --- .../concerns/ical_ingestor_export_url.rb | 49 +++++ lib/ingestors/concerns/sitemap_helpers.rb | 48 +++++ lib/ingestors/ical_ingestor.rb | 183 +++++++++--------- test/unit/ingestors/ical_ingestor_test.rb | 54 +++++- 4 files changed, 239 insertions(+), 95 deletions(-) create mode 100644 lib/ingestors/concerns/ical_ingestor_export_url.rb create mode 100644 lib/ingestors/concerns/sitemap_helpers.rb diff --git a/lib/ingestors/concerns/ical_ingestor_export_url.rb b/lib/ingestors/concerns/ical_ingestor_export_url.rb new file mode 100644 index 000000000..65968fa55 --- /dev/null +++ b/lib/ingestors/concerns/ical_ingestor_export_url.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +module Ingestors + module Concerns + # Gets the proper URL to export an ics or ical + module IcalIngestorExportUrl + private + + # 1. If the host includes 'indico', ensures the path ends with '/events.ics'. + # 2. If the path already ends with '/events.ics', return as-is. + # 3. Otherwise, append '?ical=true' query param if not already present. + # + # This method never mutates the original URL string. + # Returns the updated URL string or nil if input is blank. + def to_export(url) + return nil if url.blank? + + uri = URI.parse(url) + path = uri.path.to_s + + if uri.host&.include?('indico') + ensure_events_ics_path(uri) + elsif path.match?(%r{/(event|events)\.ics\z}) + uri.to_s + else + ensure_ical_query(uri) + end + end + + # Ensures the Indico URL ends with '/events.ics' + def ensure_events_ics_path(uri) + if uri.path&.include?('event') + uri.path = File.join(uri.path, 'event.ics') unless uri.path.end_with?('/event.ics') + elsif uri.path&.include?('category') + uri.path = File.join(uri.path, 'events.ics') unless uri.path.end_with?('/events.ics') + end + uri.to_s + end + + # Ensures the URL has '?ical=true' in its query params + def ensure_ical_query(uri) + query = URI.decode_www_form(uri.query.to_s).to_h + query['ical'] = 'true' unless query['ical'] == 'true' + uri.query = URI.encode_www_form(query) + uri.to_s + end + end + end +end diff --git a/lib/ingestors/concerns/sitemap_helpers.rb b/lib/ingestors/concerns/sitemap_helpers.rb new file mode 100644 index 000000000..c6e1b251d --- /dev/null +++ b/lib/ingestors/concerns/sitemap_helpers.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +module Ingestors + module Concerns + # From a sitemap.{xml|txt} or a single URL, get the list of URLs (= sources) + module SitemapHelpers + private + + # Reads either a sitemap.{xml|txt} or a single URL + # Returns a list of URLs from 1 to n URLs + def get_sources(source_url) + case source_url.downcase + when /sitemap(.*)?\.xml\Z/ + parse_xml_sitemap(source_url) + when /sitemap(.*)?\.txt\Z/ + parse_txt_sitemap(source_url) + else + [source_url] + end + end + + def parse_xml_sitemap(url) + urls = SitemapParser.new( + url, + recurse: true, + headers: { 'User-Agent' => config[:user_agent] } + ).to_a.uniq.map(&:strip) + + log_sitemap('xml', url, urls.count) + urls + rescue StandardError => e + @messages << "Extract from sitemap[#{url}] failed with: #{e.message}" + nil + end + + def parse_txt_sitemap(url) + urls = open_url(url).to_a.uniq.map(&:strip) + + log_sitemap('txt', url, urls.count) + urls + end + + def log_sitemap(ext, url, count) + @messages << "Parsing .#{ext} sitemap: #{url}\n - #{count} URLs found" + end + end + end +end diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb index e91fdecfa..0621b94f1 100644 --- a/lib/ingestors/ical_ingestor.rb +++ b/lib/ingestors/ical_ingestor.rb @@ -1,130 +1,125 @@ +# frozen_string_literal: true + require 'icalendar' require 'nokogiri' require 'open-uri' require 'tzinfo' module Ingestors + # Reads from direct ical / .ics / Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps. class IcalIngestor < Ingestor + include Ingestors::Concerns::SitemapHelpers + include Ingestors::Concerns::IcalIngestorExportUrl + def self.config { key: 'ical', - title: 'iCalendar', + title: 'iCalendar / Indico / .ics File', category: :events } end - def read(url) - unless url.nil? - if url.to_s.downcase.end_with? 'sitemap.xml' - process_sitemap url - else - process_icalendar url - end + def read(source_url) + @verbose = false + sources = get_sources(source_url) + return if sources.nil? + + sources.each do |url| + process_url(url) end end private - def process_sitemap(url) - # find urls for individual icalendar files - begin - sitemap = Nokogiri::XML.parse(open_url(url, raise: true)) - locs = sitemap.xpath('/ns:urlset/ns:url/ns:loc', { - 'ns' => 'http://www.sitemaps.org/schemas/sitemap/0.9' - }) - locs.each do |loc| - process_icalendar(loc.text) - end - rescue Exception => e - @messages << "Extract from sitemap[#{url}] failed with: #{e.message}" + # Modifies the given URL to the ics or ical export. + # Loops into each Ical event to process it. + # Note: One .ics file can have multiple Ical events. + def process_url(url) + export_url = to_export(url) + events = Icalendar::Event.parse(open_url(export_url, raise: true).set_encoding('utf-8')) + events.each do |e| + process_calevent(e) end + rescue StandardError => e + @messages << "Process file url[#{export_url}] failed with: #{e.message}" + end - # finished - nil + # Builds the OpenStruct event and adds it in event. + def process_calevent(calevent) + event_to_add = OpenStruct.new.tap do |event| + assign_basic_info(event, calevent) + assign_time_info(event, calevent) + assign_location_info(event, calevent.location) + end + add_event(event_to_add) + rescue StandardError => e + @messages << "Process iCalendar failed with: #{e.message}" end - def process_icalendar(url) - # process individual ics file - query = '?ical=true' + # Assigns to event: url, title, description, keywords. + def assign_basic_info(event, calevent) + event.url = calevent.url.to_s + event.title = calevent.summary.to_s + event.description = process_description calevent.description + event.keywords = process_keywords(calevent.categories) + end - begin - # append query (if required) - file_url = url - file_url << query unless url.to_s.downcase.ends_with? query + # Assigns to event: start, end, timezone. + def assign_time_info(event, calevent) + event.start = calevent.dtstart&.to_time unless calevent.dtstart.nil? + event.end = calevent.dtend&.to_time unless calevent.dtend.nil? + event.timezone = get_tzid(calevent.dtstart) + end - # process file - events = Icalendar::Event.parse(open_url(file_url, raise: true).set_encoding('utf-8')) + # Assigns to event: venue, online, city. + def assign_location_info(event, location) + return if location.blank? || !location.present? - # process each event - events.each do |e| - process_event(e) - end - rescue Exception => e - @messages << "Process file url[#{file_url}] failed with: #{e.message}" - end + event.venue = location.to_s + event.online = location.downcase.include?('online') + event.city, event.postcode, event.country = process_location(location) + end - # finished - nil + # Removes all `
` tags and converts HTML to MD. + def process_description(input) + return input if input.nil? + + desc = input.to_s.gsub('', '
') + convert_description(desc) end - def process_event(calevent) - # puts "calevent: #{calevent.inspect}" - begin - # set fields - event = OpenStruct.new - event.url = calevent.url.to_s - event.title = calevent.summary.to_s - event.description = process_description calevent.description - - # puts "\n\ncalevent.description = #{calevent.description}" - # puts "\n\n... converted = #{event.description}" - - event.end = calevent.dtend&.to_time - unless calevent.dtstart.nil? - dtstart = calevent.dtstart - event.start = dtstart&.to_time - tzid = dtstart.ical_params['tzid'] - event.timezone = tzid.first.to_s if !tzid.nil? and tzid.size > 0 - end - - event.venue = calevent.location.to_s - if calevent.location.downcase.include?('online') - event.online = true - event.city = nil - event.postcode = nil - event.country = nil - else - location = convert_location(calevent.location) - event.city = location['suburb'] unless location['suburb'].nil? - event.country = location['country'] unless location['country'].nil? - event.postcode = location['postcode'] unless location['postcode'].nil? - end - event.keywords = [] - unless calevent.categories.nil? or calevent.categories.first.nil? - cats = calevent.categories.first - if cats.is_a?(Icalendar::Values::Array) - cats.each do |item| - event.keywords << item.to_s.lstrip - end - else - event.keywords << cats.to_s.strip - end - end - - # store event - @events << event - rescue Exception => e - @messages << "Process iCalendar failed with: #{e.message}" - end + # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field. + # Handles whether tzid shows up as an Array or a single string + def get_tzid(dtstart) + return nil unless dtstart.respond_to?(:ical_params) - # finished - nil + tzid = dtstart.ical_params['tzid'] + return nil if tzid.nil? + + tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s end - def process_description(input) - return input if input.nil? + # Returns an array of 3 location characteristics: suburb, postcode, country + # Everything is nil if location.blank or location is online + def process_location(location) + return [nil, nil, nil] if location.blank? + + if location.to_s.downcase.include?('online') + [nil, nil, nil] + else + [ + location['suburb'], + location['postcode'], + location['country'] + ] + end + end + + # Returns keywords from the `CATEGORIES` ICal field + def process_keywords(categories) + return [] if categories.blank? - convert_description(input.to_s.gsub(/\R/, '
')) + categories.flatten.compact.map { |cat| cat.to_s.strip } end end end diff --git a/test/unit/ingestors/ical_ingestor_test.rb b/test/unit/ingestors/ical_ingestor_test.rb index cb6ff5a54..8a8aaae4c 100644 --- a/test/unit/ingestors/ical_ingestor_test.rb +++ b/test/unit/ingestors/ical_ingestor_test.rb @@ -26,7 +26,7 @@ class IcalIngestorTest < ActiveSupport::TestCase assert ingestor.events.empty? assert ingestor.materials.empty? - assert_includes ingestor.messages, 'Extract from sitemap[https://missing.org/sitemap.xml] failed with: 404 ' + assert_includes ingestor.messages[0], 'Extract from sitemap[https://missing.org/sitemap.xml] failed with:' end test 'ingest valid sitemap' do @@ -187,6 +187,58 @@ class IcalIngestorTest < ActiveSupport::TestCase end end + test 'process_calevent logs error when exception is raised' do + ingestor = Ingestors::IcalIngestor.new + calevent = Object.new # fake calevent + + # Stub a method that will raise an error + def ingestor.assign_basic_info(*) + raise StandardError, 'test failure' + end + + ingestor.send(:process_calevent, calevent) + + assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure' + end + + test 'to_export method' do + ingestor = Ingestors::IcalIngestor.new + indico_url_event = 'https://indico.cern.ch/event/1588342/' + indico_url_event_with_ics = 'https://indico.cern.ch/event/1588342/event.ics' # ! when '/event', event.ics is singular + indico_url_event_with_query = 'https://indico.cern.ch/event/1588342/?somerandom=urlparams&an=otherone' + indico_url_event_with_query_with_ics = 'https://indico.cern.ch/event/1588342/event.ics?somerandom=urlparams&an=otherone' + indico_url_category = 'https://indico.cern.ch/category/19377/' + indico_url_category_with_ics = 'https://indico.cern.ch/category/19377/events.ics' # ! when '/category', eventS.ics is plural + indico_url_category_with_query = 'https://indico.cern.ch/category/19377/?a=b&c=d' + indico_url_category_with_query_with_ics = 'https://indico.cern.ch/category/19377/events.ics?a=b&c=d' + url_with_ics = 'https://mywebsite.com/event/blabla/events.ics' + url_with_query_with_ics = 'https://mywebsite.com/event/blabla/events.ics?john=doe&isstub=born' + url_no_ical = 'https://mywebsite.com/event/blabla' + url_with_ical = 'https://mywebsite.com/event/blabla?ical=true' + + # When indico link – event + assert_equal ingestor.send(:to_export, indico_url_event), indico_url_event_with_ics # adds ics + assert_equal ingestor.send(:to_export, indico_url_event_with_query), indico_url_event_with_query_with_ics # adds ics + + # When indico link – category + assert_equal ingestor.send(:to_export, indico_url_category), indico_url_category_with_ics # adds ics + assert_equal ingestor.send(:to_export, indico_url_category_with_query), indico_url_category_with_query_with_ics # adds ics + + # When non-indico link + assert_equal ingestor.send(:to_export, url_with_ics), url_with_ics # keeps same + assert_equal ingestor.send(:to_export, url_with_query_with_ics), url_with_query_with_ics # keeps same + + # When indico link which already has the /events.ics + assert_equal ingestor.send(:to_export, indico_url_event_with_ics), indico_url_event_with_ics # keeps it as-is + assert_equal ingestor.send(:to_export, indico_url_event_with_query_with_ics), indico_url_event_with_query_with_ics # keeps it as-is + + # When other url, adds the ical query param + assert_equal ingestor.send(:to_export, url_no_ical), url_with_ical + + # When other url with ical query param, keep it as-is + assert_equal ingestor.send(:to_export, url_with_ical), url_with_ical + end + private def check_event_exists(title, url) From 59795140a1ccbb0083ade7a74c702d8595358b62 Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Mon, 27 Oct 2025 14:07:17 +0100 Subject: [PATCH 02/10] review(#1161): path more precise, location check, added contact --- .../concerns/ical_ingestor_export_url.rb | 49 ------------- lib/ingestors/ical_ingestor.rb | 69 +++++++++++++------ test/unit/ingestors/ical_ingestor_test.rb | 6 +- 3 files changed, 50 insertions(+), 74 deletions(-) delete mode 100644 lib/ingestors/concerns/ical_ingestor_export_url.rb diff --git a/lib/ingestors/concerns/ical_ingestor_export_url.rb b/lib/ingestors/concerns/ical_ingestor_export_url.rb deleted file mode 100644 index 65968fa55..000000000 --- a/lib/ingestors/concerns/ical_ingestor_export_url.rb +++ /dev/null @@ -1,49 +0,0 @@ -# frozen_string_literal: true - -module Ingestors - module Concerns - # Gets the proper URL to export an ics or ical - module IcalIngestorExportUrl - private - - # 1. If the host includes 'indico', ensures the path ends with '/events.ics'. - # 2. If the path already ends with '/events.ics', return as-is. - # 3. Otherwise, append '?ical=true' query param if not already present. - # - # This method never mutates the original URL string. - # Returns the updated URL string or nil if input is blank. - def to_export(url) - return nil if url.blank? - - uri = URI.parse(url) - path = uri.path.to_s - - if uri.host&.include?('indico') - ensure_events_ics_path(uri) - elsif path.match?(%r{/(event|events)\.ics\z}) - uri.to_s - else - ensure_ical_query(uri) - end - end - - # Ensures the Indico URL ends with '/events.ics' - def ensure_events_ics_path(uri) - if uri.path&.include?('event') - uri.path = File.join(uri.path, 'event.ics') unless uri.path.end_with?('/event.ics') - elsif uri.path&.include?('category') - uri.path = File.join(uri.path, 'events.ics') unless uri.path.end_with?('/events.ics') - end - uri.to_s - end - - # Ensures the URL has '?ical=true' in its query params - def ensure_ical_query(uri) - query = URI.decode_www_form(uri.query.to_s).to_h - query['ical'] = 'true' unless query['ical'] == 'true' - uri.query = URI.encode_www_form(query) - uri.to_s - end - end - end -end diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb index 0621b94f1..bb2b36ce4 100644 --- a/lib/ingestors/ical_ingestor.rb +++ b/lib/ingestors/ical_ingestor.rb @@ -9,7 +9,6 @@ module Ingestors # Reads from direct ical / .ics / Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps. class IcalIngestor < Ingestor include Ingestors::Concerns::SitemapHelpers - include Ingestors::Concerns::IcalIngestorExportUrl def self.config { @@ -37,6 +36,8 @@ def read(source_url) def process_url(url) export_url = to_export(url) events = Icalendar::Event.parse(open_url(export_url, raise: true).set_encoding('utf-8')) + raise 'Not found' if events.nil? || events.empty? + events.each do |e| process_calevent(e) end @@ -44,6 +45,47 @@ def process_url(url) @messages << "Process file url[#{export_url}] failed with: #{e.message}" end + # 1. If the path already ends with '/events.ics', return as-is. + # 2. If the host includes 'indico', ensures the path ends with '/events.ics'. + # 3. Otherwise, append '?ical=true' query param if not already present. + # + # This method never mutates the original URL string. + # Returns the updated URL string or nil if input is blank. + def to_export(url) + return nil if url.blank? + + uri = URI.parse(url) + path = uri.path.to_s + + if path.match?(%r{/(event|events)\.ics\z}) + uri.to_s + elsif uri.host&.include?('indico') + ensure_events_ics_path(uri) + else + ensure_ical_query(uri) + end + end + + # Ensures the Indico URL ends with '/events.ics' + def ensure_events_ics_path(uri) + paths = uri.path.split('/') + uri.path = "#{paths[0..2].join('/')}/" + if paths[1] == 'event' + uri.path = File.join(uri.path, 'event.ics') + elsif paths[1] == 'category' + uri.path = File.join(uri.path, 'events.ics') + end + uri.to_s + end + + # Ensures the URL has '?ical=true' in its query params + def ensure_ical_query(uri) + query = URI.decode_www_form(uri.query.to_s).to_h + query['ical'] = 'true' unless query['ical'] == 'true' + uri.query = URI.encode_www_form(query) + uri.to_s + end + # Builds the OpenStruct event and adds it in event. def process_calevent(calevent) event_to_add = OpenStruct.new.tap do |event| @@ -60,8 +102,9 @@ def process_calevent(calevent) def assign_basic_info(event, calevent) event.url = calevent.url.to_s event.title = calevent.summary.to_s - event.description = process_description calevent.description + event.description = calevent.description.to_s event.keywords = process_keywords(calevent.categories) + event.contact = calevent.contact.join(', ') end # Assigns to event: start, end, timezone. @@ -73,21 +116,13 @@ def assign_time_info(event, calevent) # Assigns to event: venue, online, city. def assign_location_info(event, location) - return if location.blank? || !location.present? + return if location.blank? event.venue = location.to_s event.online = location.downcase.include?('online') event.city, event.postcode, event.country = process_location(location) end - # Removes all `
` tags and converts HTML to MD. - def process_description(input) - return input if input.nil? - - desc = input.to_s.gsub('', '
') - convert_description(desc) - end - # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field. # Handles whether tzid shows up as an Array or a single string def get_tzid(dtstart) @@ -102,17 +137,9 @@ def get_tzid(dtstart) # Returns an array of 3 location characteristics: suburb, postcode, country # Everything is nil if location.blank or location is online def process_location(location) - return [nil, nil, nil] if location.blank? + return [location['suburb'], location['postcode'], location['country']] if location.is_a?(Array) - if location.to_s.downcase.include?('online') - [nil, nil, nil] - else - [ - location['suburb'], - location['postcode'], - location['country'] - ] - end + [nil, nil, nil] end # Returns keywords from the `CATEGORIES` ICal field diff --git a/test/unit/ingestors/ical_ingestor_test.rb b/test/unit/ingestors/ical_ingestor_test.rb index 8a8aaae4c..17ebc4cf0 100644 --- a/test/unit/ingestors/ical_ingestor_test.rb +++ b/test/unit/ingestors/ical_ingestor_test.rb @@ -192,12 +192,10 @@ class IcalIngestorTest < ActiveSupport::TestCase calevent = Object.new # fake calevent # Stub a method that will raise an error - def ingestor.assign_basic_info(*) - raise StandardError, 'test failure' + ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do + ingestor.send(:process_calevent, calevent) end - ingestor.send(:process_calevent, calevent) - assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure' end From f3f16b97e0fb14890e94aebc5c6e53c3d9046727 Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Mon, 27 Oct 2025 15:55:51 +0100 Subject: [PATCH 03/10] feat(indico-ingestor): API token auth --- config/secrets.example.yml | 1 + lib/ingestors/ical_ingestor.rb | 4 +++- lib/ingestors/ingestor.rb | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/config/secrets.example.yml b/config/secrets.example.yml index 018b0439e..718f5ad0f 100644 --- a/config/secrets.example.yml +++ b/config/secrets.example.yml @@ -38,6 +38,7 @@ external_api_keys: &external_api_keys password: gpt_api_key: willma_api_key: + indico_api_token: # begins by 'indp_', cf. https://docs.getindico.io/en/stable/http-api/access/#api-token-authentication #Internal config development: diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb index bb2b36ce4..647af15c1 100644 --- a/lib/ingestors/ical_ingestor.rb +++ b/lib/ingestors/ical_ingestor.rb @@ -19,6 +19,7 @@ def self.config end def read(source_url) + @token << Rails.application.config.secrets.indico_api_token @verbose = false sources = get_sources(source_url) return if sources.nil? @@ -35,7 +36,8 @@ def read(source_url) # Note: One .ics file can have multiple Ical events. def process_url(url) export_url = to_export(url) - events = Icalendar::Event.parse(open_url(export_url, raise: true).set_encoding('utf-8')) + content = open_url(export_url, token: @token, raise: true).set_encoding('utf-8') + events = Icalendar::Event.parse(content) raise 'Not found' if events.nil? || events.empty? events.each do |e| diff --git a/lib/ingestors/ingestor.rb b/lib/ingestors/ingestor.rb index 8f043f135..e552f16fe 100644 --- a/lib/ingestors/ingestor.rb +++ b/lib/ingestors/ingestor.rb @@ -50,13 +50,14 @@ def stats_summary(type) summary end - def open_url(url, raise: false) + def open_url(url, token: '', raise: false) options = { redirect: false, # We're doing redirects manually below, since open-uri can't handle http -> https redirection read_timeout: 5 } options[:ssl_verify_mode] = config[:ssl_verify_mode] if config.key?(:ssl_verify_mode) redirect_attempts = 5 + options['Authorization'] = "Bearer #{token}" if token begin URI(url).open(options) rescue OpenURI::HTTPRedirect => e From 529009ba830d2a4272bac70ae9677494c70e6565 Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Mon, 27 Oct 2025 16:33:37 +0100 Subject: [PATCH 04/10] fixtests(indico-ingestor): typo --- lib/ingestors/ical_ingestor.rb | 2 +- lib/ingestors/ingestor.rb | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb index 647af15c1..09fdb8ae6 100644 --- a/lib/ingestors/ical_ingestor.rb +++ b/lib/ingestors/ical_ingestor.rb @@ -19,7 +19,7 @@ def self.config end def read(source_url) - @token << Rails.application.config.secrets.indico_api_token + @token = Rails.application.config.secrets.indico_api_token @verbose = false sources = get_sources(source_url) return if sources.nil? diff --git a/lib/ingestors/ingestor.rb b/lib/ingestors/ingestor.rb index e552f16fe..a662985cf 100644 --- a/lib/ingestors/ingestor.rb +++ b/lib/ingestors/ingestor.rb @@ -50,14 +50,14 @@ def stats_summary(type) summary end - def open_url(url, token: '', raise: false) + def open_url(url, raise: false, token: nil) options = { redirect: false, # We're doing redirects manually below, since open-uri can't handle http -> https redirection read_timeout: 5 } options[:ssl_verify_mode] = config[:ssl_verify_mode] if config.key?(:ssl_verify_mode) redirect_attempts = 5 - options['Authorization'] = "Bearer #{token}" if token + options['Authorization'] = "Bearer #{token}" unless token.nil? begin URI(url).open(options) rescue OpenURI::HTTPRedirect => e From 5b9ab3f676d848e07cb636c76de5c2d2508812a0 Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Fri, 21 Nov 2025 09:52:41 +0100 Subject: [PATCH 05/10] revert(ical_ingestor): to before the indico changes --- lib/ingestors/ical_ingestor.rb | 212 ++++++++++------------ test/unit/ingestors/ical_ingestor_test.rb | 52 +----- 2 files changed, 95 insertions(+), 169 deletions(-) diff --git a/lib/ingestors/ical_ingestor.rb b/lib/ingestors/ical_ingestor.rb index 09fdb8ae6..e91fdecfa 100644 --- a/lib/ingestors/ical_ingestor.rb +++ b/lib/ingestors/ical_ingestor.rb @@ -1,154 +1,130 @@ -# frozen_string_literal: true - require 'icalendar' require 'nokogiri' require 'open-uri' require 'tzinfo' module Ingestors - # Reads from direct ical / .ics / Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps. class IcalIngestor < Ingestor - include Ingestors::Concerns::SitemapHelpers - def self.config { key: 'ical', - title: 'iCalendar / Indico / .ics File', + title: 'iCalendar', category: :events } end - def read(source_url) - @token = Rails.application.config.secrets.indico_api_token - @verbose = false - sources = get_sources(source_url) - return if sources.nil? - - sources.each do |url| - process_url(url) + def read(url) + unless url.nil? + if url.to_s.downcase.end_with? 'sitemap.xml' + process_sitemap url + else + process_icalendar url + end end end private - # Modifies the given URL to the ics or ical export. - # Loops into each Ical event to process it. - # Note: One .ics file can have multiple Ical events. - def process_url(url) - export_url = to_export(url) - content = open_url(export_url, token: @token, raise: true).set_encoding('utf-8') - events = Icalendar::Event.parse(content) - raise 'Not found' if events.nil? || events.empty? - - events.each do |e| - process_calevent(e) - end - rescue StandardError => e - @messages << "Process file url[#{export_url}] failed with: #{e.message}" - end - - # 1. If the path already ends with '/events.ics', return as-is. - # 2. If the host includes 'indico', ensures the path ends with '/events.ics'. - # 3. Otherwise, append '?ical=true' query param if not already present. - # - # This method never mutates the original URL string. - # Returns the updated URL string or nil if input is blank. - def to_export(url) - return nil if url.blank? - - uri = URI.parse(url) - path = uri.path.to_s - - if path.match?(%r{/(event|events)\.ics\z}) - uri.to_s - elsif uri.host&.include?('indico') - ensure_events_ics_path(uri) - else - ensure_ical_query(uri) - end - end - - # Ensures the Indico URL ends with '/events.ics' - def ensure_events_ics_path(uri) - paths = uri.path.split('/') - uri.path = "#{paths[0..2].join('/')}/" - if paths[1] == 'event' - uri.path = File.join(uri.path, 'event.ics') - elsif paths[1] == 'category' - uri.path = File.join(uri.path, 'events.ics') - end - uri.to_s - end - - # Ensures the URL has '?ical=true' in its query params - def ensure_ical_query(uri) - query = URI.decode_www_form(uri.query.to_s).to_h - query['ical'] = 'true' unless query['ical'] == 'true' - uri.query = URI.encode_www_form(query) - uri.to_s - end - - # Builds the OpenStruct event and adds it in event. - def process_calevent(calevent) - event_to_add = OpenStruct.new.tap do |event| - assign_basic_info(event, calevent) - assign_time_info(event, calevent) - assign_location_info(event, calevent.location) + def process_sitemap(url) + # find urls for individual icalendar files + begin + sitemap = Nokogiri::XML.parse(open_url(url, raise: true)) + locs = sitemap.xpath('/ns:urlset/ns:url/ns:loc', { + 'ns' => 'http://www.sitemaps.org/schemas/sitemap/0.9' + }) + locs.each do |loc| + process_icalendar(loc.text) + end + rescue Exception => e + @messages << "Extract from sitemap[#{url}] failed with: #{e.message}" end - add_event(event_to_add) - rescue StandardError => e - @messages << "Process iCalendar failed with: #{e.message}" - end - # Assigns to event: url, title, description, keywords. - def assign_basic_info(event, calevent) - event.url = calevent.url.to_s - event.title = calevent.summary.to_s - event.description = calevent.description.to_s - event.keywords = process_keywords(calevent.categories) - event.contact = calevent.contact.join(', ') + # finished + nil end - # Assigns to event: start, end, timezone. - def assign_time_info(event, calevent) - event.start = calevent.dtstart&.to_time unless calevent.dtstart.nil? - event.end = calevent.dtend&.to_time unless calevent.dtend.nil? - event.timezone = get_tzid(calevent.dtstart) - end - - # Assigns to event: venue, online, city. - def assign_location_info(event, location) - return if location.blank? + def process_icalendar(url) + # process individual ics file + query = '?ical=true' - event.venue = location.to_s - event.online = location.downcase.include?('online') - event.city, event.postcode, event.country = process_location(location) - end + begin + # append query (if required) + file_url = url + file_url << query unless url.to_s.downcase.ends_with? query - # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field. - # Handles whether tzid shows up as an Array or a single string - def get_tzid(dtstart) - return nil unless dtstart.respond_to?(:ical_params) + # process file + events = Icalendar::Event.parse(open_url(file_url, raise: true).set_encoding('utf-8')) - tzid = dtstart.ical_params['tzid'] - return nil if tzid.nil? + # process each event + events.each do |e| + process_event(e) + end + rescue Exception => e + @messages << "Process file url[#{file_url}] failed with: #{e.message}" + end - tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s + # finished + nil end - # Returns an array of 3 location characteristics: suburb, postcode, country - # Everything is nil if location.blank or location is online - def process_location(location) - return [location['suburb'], location['postcode'], location['country']] if location.is_a?(Array) + def process_event(calevent) + # puts "calevent: #{calevent.inspect}" + begin + # set fields + event = OpenStruct.new + event.url = calevent.url.to_s + event.title = calevent.summary.to_s + event.description = process_description calevent.description + + # puts "\n\ncalevent.description = #{calevent.description}" + # puts "\n\n... converted = #{event.description}" + + event.end = calevent.dtend&.to_time + unless calevent.dtstart.nil? + dtstart = calevent.dtstart + event.start = dtstart&.to_time + tzid = dtstart.ical_params['tzid'] + event.timezone = tzid.first.to_s if !tzid.nil? and tzid.size > 0 + end + + event.venue = calevent.location.to_s + if calevent.location.downcase.include?('online') + event.online = true + event.city = nil + event.postcode = nil + event.country = nil + else + location = convert_location(calevent.location) + event.city = location['suburb'] unless location['suburb'].nil? + event.country = location['country'] unless location['country'].nil? + event.postcode = location['postcode'] unless location['postcode'].nil? + end + event.keywords = [] + unless calevent.categories.nil? or calevent.categories.first.nil? + cats = calevent.categories.first + if cats.is_a?(Icalendar::Values::Array) + cats.each do |item| + event.keywords << item.to_s.lstrip + end + else + event.keywords << cats.to_s.strip + end + end + + # store event + @events << event + rescue Exception => e + @messages << "Process iCalendar failed with: #{e.message}" + end - [nil, nil, nil] + # finished + nil end - # Returns keywords from the `CATEGORIES` ICal field - def process_keywords(categories) - return [] if categories.blank? + def process_description(input) + return input if input.nil? - categories.flatten.compact.map { |cat| cat.to_s.strip } + convert_description(input.to_s.gsub(/\R/, '
')) end end end diff --git a/test/unit/ingestors/ical_ingestor_test.rb b/test/unit/ingestors/ical_ingestor_test.rb index 17ebc4cf0..cb6ff5a54 100644 --- a/test/unit/ingestors/ical_ingestor_test.rb +++ b/test/unit/ingestors/ical_ingestor_test.rb @@ -26,7 +26,7 @@ class IcalIngestorTest < ActiveSupport::TestCase assert ingestor.events.empty? assert ingestor.materials.empty? - assert_includes ingestor.messages[0], 'Extract from sitemap[https://missing.org/sitemap.xml] failed with:' + assert_includes ingestor.messages, 'Extract from sitemap[https://missing.org/sitemap.xml] failed with: 404 ' end test 'ingest valid sitemap' do @@ -187,56 +187,6 @@ class IcalIngestorTest < ActiveSupport::TestCase end end - test 'process_calevent logs error when exception is raised' do - ingestor = Ingestors::IcalIngestor.new - calevent = Object.new # fake calevent - - # Stub a method that will raise an error - ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do - ingestor.send(:process_calevent, calevent) - end - - assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure' - end - - test 'to_export method' do - ingestor = Ingestors::IcalIngestor.new - indico_url_event = 'https://indico.cern.ch/event/1588342/' - indico_url_event_with_ics = 'https://indico.cern.ch/event/1588342/event.ics' # ! when '/event', event.ics is singular - indico_url_event_with_query = 'https://indico.cern.ch/event/1588342/?somerandom=urlparams&an=otherone' - indico_url_event_with_query_with_ics = 'https://indico.cern.ch/event/1588342/event.ics?somerandom=urlparams&an=otherone' - indico_url_category = 'https://indico.cern.ch/category/19377/' - indico_url_category_with_ics = 'https://indico.cern.ch/category/19377/events.ics' # ! when '/category', eventS.ics is plural - indico_url_category_with_query = 'https://indico.cern.ch/category/19377/?a=b&c=d' - indico_url_category_with_query_with_ics = 'https://indico.cern.ch/category/19377/events.ics?a=b&c=d' - url_with_ics = 'https://mywebsite.com/event/blabla/events.ics' - url_with_query_with_ics = 'https://mywebsite.com/event/blabla/events.ics?john=doe&isstub=born' - url_no_ical = 'https://mywebsite.com/event/blabla' - url_with_ical = 'https://mywebsite.com/event/blabla?ical=true' - - # When indico link – event - assert_equal ingestor.send(:to_export, indico_url_event), indico_url_event_with_ics # adds ics - assert_equal ingestor.send(:to_export, indico_url_event_with_query), indico_url_event_with_query_with_ics # adds ics - - # When indico link – category - assert_equal ingestor.send(:to_export, indico_url_category), indico_url_category_with_ics # adds ics - assert_equal ingestor.send(:to_export, indico_url_category_with_query), indico_url_category_with_query_with_ics # adds ics - - # When non-indico link - assert_equal ingestor.send(:to_export, url_with_ics), url_with_ics # keeps same - assert_equal ingestor.send(:to_export, url_with_query_with_ics), url_with_query_with_ics # keeps same - - # When indico link which already has the /events.ics - assert_equal ingestor.send(:to_export, indico_url_event_with_ics), indico_url_event_with_ics # keeps it as-is - assert_equal ingestor.send(:to_export, indico_url_event_with_query_with_ics), indico_url_event_with_query_with_ics # keeps it as-is - - # When other url, adds the ical query param - assert_equal ingestor.send(:to_export, url_no_ical), url_with_ical - - # When other url with ical query param, keep it as-is - assert_equal ingestor.send(:to_export, url_with_ical), url_with_ical - end - private def check_event_exists(title, url) From af243e3e493e365e1ddae4da74e4f516dd29fa08 Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Tue, 2 Dec 2025 11:54:00 +0100 Subject: [PATCH 06/10] review(#1161): Indico ingestor as a separate class --- lib/ingestors/indico_ingestor.rb | 167 +++++++++++++ lib/ingestors/ingestor_factory.rb | 1 + test/unit/ingestors/indico_ingestor_test.rb | 248 ++++++++++++++++++++ 3 files changed, 416 insertions(+) create mode 100644 lib/ingestors/indico_ingestor.rb create mode 100644 test/unit/ingestors/indico_ingestor_test.rb diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb new file mode 100644 index 000000000..2ee1cd1a3 --- /dev/null +++ b/lib/ingestors/indico_ingestor.rb @@ -0,0 +1,167 @@ +# frozen_string_literal: true + +require 'icalendar' +require 'nokogiri' +require 'open-uri' +require 'tzinfo' + +module Ingestors + # Reads from direct .ics or Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps. + class IndicoIngestor < Ingestor + include Ingestors::Concerns::SitemapHelpers + + def self.config + { + key: 'indico', + title: 'Indico / .ics file', + category: :events + } + end + + def read(source_url) + @token = Rails.application.config.secrets.indico_api_token + @verbose = false + sources = get_sources(source_url) + return if sources.nil? + + sources.each do |url| + process_url(url) + end + end + + private + + # Modifies the given URL to the ics export. + # Loops into each Ical event to process it. + # Note: One .ics file can have multiple Ical events. + def process_url(url) + export_url = to_export(url) + raise 'Not an indico link' if export_url.nil? + + content = open_url(export_url, token: @token, raise: true).set_encoding('utf-8') + events = Icalendar::Event.parse(content) + raise 'Not found' if events.nil? || events.empty? + + events.each do |e| + process_calevent(e) + end + rescue StandardError => e + @messages << "Process file url[#{export_url}] failed with: #{e.message}" + end + + # 1. If the path already ends with '/events.ics', return as-is. + # 2. If the host includes 'indico', ensures the path ends with '/events.ics'. + # 3. Otherwise, append '?ical=true' query param if not already present. + # + # This method never mutates the original URL string. + # Returns the updated URL string or nil if input is blank. + def to_export(url) + return nil if url.blank? + + uri = URI.parse(url) + path = uri.path.to_s + + if path.match?(%r{/(event|events)\.ics\z}) + uri.to_s + elsif indico_page?(uri) + ensure_events_ics_path(uri) + else + nil + end + end + + def indico_page?(uri) + # Either checks in host, e.g., 'indico.myinstitution.com' + return true if uri.host&.include?('indico') + + # Or checks in meta tags + html = open_url(uri, raise: true) + doc = Nokogiri::HTML(html) + content = doc.at('meta[property="og:site_name"]')&.[]('content') + content&.match?(/indico/i) + end + + # Ensures the Indico URL ends with '/events.ics' + def ensure_events_ics_path(uri) + paths = uri.path.split('/') + uri.path = "#{paths[0..2].join('/')}/" + if paths[1] == 'event' + uri.path = File.join(uri.path, 'event.ics') + elsif paths[1] == 'category' + uri.path = File.join(uri.path, 'events.ics') + end + uri.to_s + end + + # Ensures the URL has '?ical=true' in its query params + def ensure_ical_query(uri) + query = URI.decode_www_form(uri.query.to_s).to_h + query['ical'] = 'true' unless query['ical'] == 'true' + uri.query = URI.encode_www_form(query) + uri.to_s + end + + # Builds the OpenStruct event and adds it in event. + def process_calevent(calevent) + event_to_add = OpenStruct.new.tap do |event| + assign_basic_info(event, calevent) + assign_time_info(event, calevent) + assign_location_info(event, calevent.location) + end + add_event(event_to_add) + rescue StandardError => e + @messages << "Process iCalendar failed with: #{e.message}" + end + + # Assigns to event: url, title, description, keywords. + def assign_basic_info(event, calevent) + event.url = calevent.url.to_s + event.title = calevent.summary.to_s + event.description = calevent.description.to_s + event.keywords = process_keywords(calevent.categories) + event.contact = calevent.contact.join(', ') + end + + # Assigns to event: start, end, timezone. + def assign_time_info(event, calevent) + event.start = calevent.dtstart&.to_time unless calevent.dtstart.nil? + event.end = calevent.dtend&.to_time unless calevent.dtend.nil? + event.timezone = get_tzid(calevent.dtstart) + end + + # Assigns to event: venue, online, city. + def assign_location_info(event, location) + return if location.blank? + + event.venue = location.to_s + event.online = location.downcase.include?('online') + event.city, event.postcode, event.country = process_location(location) + end + + # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field. + # Handles whether tzid shows up as an Array or a single string + def get_tzid(dtstart) + return nil unless dtstart.respond_to?(:ical_params) + + tzid = dtstart.ical_params['tzid'] + return nil if tzid.nil? + + tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s + end + + # Returns an array of 3 location characteristics: suburb, postcode, country + # Everything is nil if location.blank or location is online + def process_location(location) + return [location['suburb'], location['postcode'], location['country']] if location.is_a?(Array) + + [nil, nil, nil] + end + + # Returns keywords from the `CATEGORIES` ICal field + def process_keywords(categories) + return [] if categories.blank? + + categories.flatten.compact.map { |cat| cat.to_s.strip } + end + end +end diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb index 67e818d02..b2cafda9f 100644 --- a/lib/ingestors/ingestor_factory.rb +++ b/lib/ingestors/ingestor_factory.rb @@ -6,6 +6,7 @@ def self.ingestors Ingestors::EventbriteIngestor, Ingestors::EventCsvIngestor, Ingestors::IcalIngestor, + Ingestors::IndicoIngestor, Ingestors::LibcalIngestor, Ingestors::MaterialCsvIngestor, Ingestors::TessEventIngestor, diff --git a/test/unit/ingestors/indico_ingestor_test.rb b/test/unit/ingestors/indico_ingestor_test.rb new file mode 100644 index 000000000..eaf19191a --- /dev/null +++ b/test/unit/ingestors/indico_ingestor_test.rb @@ -0,0 +1,248 @@ +require 'test_helper' + +class IndicoIngestorTest < ActiveSupport::TestCase + setup do + @user = users(:regular_user) + @content_provider = content_providers(:another_portal_provider) + mock_ingestions + # mock_nominatim + mock_timezone # System time zone should not affect test result + end + + teardown do + reset_timezone + end + + test 'sitemap not found' do + source = @content_provider.sources.build(url: 'https://missing.org/sitemap.xml', + method: 'ical', + enabled: true) + ingestor = Ingestors::IcalIngestor.new + + assert_no_difference('Event.count') do + ingestor.read(source.url) + ingestor.write(@user, @content_provider) + end + + assert ingestor.events.empty? + assert ingestor.materials.empty? + assert_includes ingestor.messages[0], 'Extract from sitemap[https://missing.org/sitemap.xml] failed with:' + end + + test 'ingest valid sitemap' do + source = @content_provider.sources.build(url: 'https://app.com/events/sitemap.xml', + method: 'ical', + enabled: true) + ingestor = Ingestors::IcalIngestor.new + + # check two events to be updated + name = 'ical_event_1' + event = events(:ical_event_1) + refute event.nil?, "event[#{name}] not found" + refute event.online?, "event[#{name}] online not matched" + assert_equal 'Another Portal Provider', event.content_provider.title, + "event[#{name}] content provider not matched" + + name = 'ical_event_2' + refute events(name).nil?, "fixture[#{name}] not found" + title = 'PaCER Seminar: Computational Fluid Dynamics' + url = 'https://pawsey.org.au/event/pacer-seminar-computational-fluid-dynamics/' + event = check_event_exists title, url + refute event.nil?, "event title[#{title}] not found" + refute event.online?, "event title[#{title}] online not matched" + assert_equal 'Another Portal Provider', event.content_provider.title, + "event title[#{title}] content provider not matched" + + assert_difference('Event.count', 4) do + freeze_time(2019) do + ingestor.read(source.url) + ingestor.write(@user, @content_provider) + end + end + + assert_equal 8, ingestor.events.count + assert ingestor.materials.empty? + assert_equal 4, ingestor.stats[:events][:added] + assert_equal 2, ingestor.stats[:events][:updated] + assert_equal 2, ingestor.stats[:events][:rejected] + + # check individual events + # check not found + assert_includes ingestor.messages, "Process file url\[https://pawsey.org.au/events/\?ical=true\] failed with: 404 " + + # check rejected + event = ingestor.events.detect { |e| e.title == 'NVIDIA cuQuantum Session' } + assert event + assert event.errors.added?(:url, :url, value: '123') + event = ingestor.events.detect { |e| e.title == 'PaCER Seminar: Radio astronomy' } + assert event + assert event.errors.added?(:url, :blank) + + # check added + title = 'Ask Me Anything: Porous media visualisation and LBPM' + event = check_event_exists title, 'https://pawsey.org.au/event/ask-me-anything-porous-media-visualisation-and-lbpm/' + assert event.online?, "event title[#{event.title}] online not matched" + assert (!event.keywords.nil? and event.keywords.size == 2), "event title[#{event.title}] keywords.size not matched" + assert event.keywords.include?('AMA'), "event title[#{event.title}] keyword[AMA] not found" + assert event.keywords.include?('Visualisation'), "event title[#{event.title}] keyword[Visualisation] not found" + + title = 'Pawsey Intern Showcase 2022' + event = check_event_exists title, 'https://pawsey.org.au/event/pawsey-intern-showcase-2022/' + assert_includes event.description, 'The Pawsey Supercomputing Research Centre takes prides in its Summer Internship Program' + assert_includes event.description, 'range of trainings we immerse students in during Week 1 of the Program (and throughout).' + assert_equal 'Perth', event.timezone.to_s, "event title[#{event.title}] timezone not matched" + assert_equal '2022-02-11 01:45:00 UTC', event.start.utc.to_s, "event title[#{event.title}] start not matched" + assert_equal '2022-02-11 04:50:00 UTC', event.end.utc.to_s, "event title[#{event.title}] end not matched" + + title = 'P\'Con - Experience with porting and scaling codes on AMD GPUs' + event = check_event_exists title, 'https://pawsey.org.au/event/experience-with-porting-and-scaling-codes-on-amd-gpus/' + assert event.online?, "event title[#{title}] online not matched" + + title = 'Overview of High Performance Computing Resources at OLCF' + event = check_event_exists title, 'https://pawsey.org.au/event/overview-of-high-performance-computing-resources-at-olcf/' + refute event.online?, "event title[#{title}] online not matched" + location = 'Pawsey Supercomputing Centre, 1 Bryce Avenue, Kensington, Western Australia, 6151, Australia' + assert_equal location, event.venue, "event title[#{title}] venue not matched" + # Geocoding is disabled so these fail TODO: Re-enable, but using cache + rate limiting + # assert_equal 'Kensington', event.city, "event title[#{title}] city not matched" + # assert_equal '6151', event.postcode, "event title[#{title}] postcode not matched" + # assert_equal 'Australia', event.country, "event title[#{title}] country not matched" + + # TODO: check updated + title = 'PaCER Seminar: Computational Fluid Dynamics' + event = check_event_exists title, 'https://pawsey.org.au/event/pacer-seminar-computational-fluid-dynamics/' + assert_equal '2022-06-15 03:00:00 UTC', event.end.utc.to_s, "event title[#{event.title}] updated end not matched" + assert event.description != 'MyText', "event title[#{event.title}] description not updated" + assert event.description.size > 100, "event title[#{event.title}] description too short" + assert event.online?, "event title[#{event.title}] online not matched" + assert_equal 2, event.keywords.size, "event title[#{event.title}] keywords size not matched" + %w[Supercomputing Seminar].each do |keyword| + assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found" + end + assert_equal 'Online, Virtual, Australia', event.venue, "event title[#{event.title}] venue not matched" + assert event.city.nil?, "event title[#{event.title}] city not matched" + assert event.postcode.nil?, "event title[#{event.title}] postcode not matched" + assert event.country.nil?, "event title[#{event.title}] country not matched" + + title = "P'Con - Embracing new solutions for in-situ visualisation" + event = check_event_exists title, 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/' + assert event.online?, "event title[#{event.title}] online not matched" + assert_equal 3, event.keywords.size, "event title[#{event.title}] keywords size not matched" + %w[Supercomputing Conference Visualisation].each do |keyword| + assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found" + end + assert_equal 'Online, Virtual, Australia', event.venue, "event title[#{event.title}] venue not matched" + assert event.postcode.nil?, "event title[#{event.title}] postcode not matched" + assert event.city.nil?, "event title[#{event.title}] city not matched" + assert event.country.nil?, "event title[#{event.title}] country not matched" + end + + test 'check single ical sources' do + # override time + assert_no_difference 'Event.count' do + freeze_time(2019) do + ingestor = Ingestors::IcalIngestor.new + source = @content_provider.sources.build( + url: 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/?ical=true', + method: 'ical', enabled: true + ) + + ingestor.read(source.url) + ingestor.write(@user, @content_provider) + + assert_equal 1, ingestor.events.count + assert ingestor.materials.empty? + assert_equal 0, ingestor.stats[:events][:added] + assert_equal 1, ingestor.stats[:events][:updated] + assert_equal 0, ingestor.stats[:events][:rejected] + + ingestor = Ingestors::IcalIngestor.new + source = @content_provider.sources.build( + url: 'https://pawsey.org.au/event/pawsey-intern-showcase-2021/?ical=true', + method: 'ical', enabled: true + ) + + ingestor.read(source.url) + ingestor.write(@user, @content_provider) + + assert_equal 1, ingestor.events.count + assert ingestor.materials.empty? + assert_equal 0, ingestor.stats[:events][:added] + assert_equal 0, ingestor.stats[:events][:updated] + assert_equal 1, ingestor.stats[:events][:rejected] + + event = ingestor.events.detect { |e| e.title == 'Pawsey Intern Showcase 2021' } + assert event + assert event.errors.added?(:url, :blank) + end + end + + # get updated + title = 'P\'Con - Embracing new solutions for in-situ visualisation' + url = 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/' + event = check_event_exists title, url + assert_equal 3, event.keywords.size + %w[Supercomputing Conference Visualisation].each do |keyword| + assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found" + end + end + + test 'process_calevent logs error when exception is raised' do + ingestor = Ingestors::IcalIngestor.new + calevent = Object.new # fake calevent + + # Stub a method that will raise an error + ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do + ingestor.send(:process_calevent, calevent) + end + + assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure' + end + + test 'to_export method' do + ingestor = Ingestors::IcalIngestor.new + indico_url_event = 'https://indico.cern.ch/event/1588342/' + indico_url_event_with_ics = 'https://indico.cern.ch/event/1588342/event.ics' # ! when '/event', event.ics is singular + indico_url_event_with_query = 'https://indico.cern.ch/event/1588342/?somerandom=urlparams&an=otherone' + indico_url_event_with_query_with_ics = 'https://indico.cern.ch/event/1588342/event.ics?somerandom=urlparams&an=otherone' + indico_url_category = 'https://indico.cern.ch/category/19377/' + indico_url_category_with_ics = 'https://indico.cern.ch/category/19377/events.ics' # ! when '/category', eventS.ics is plural + indico_url_category_with_query = 'https://indico.cern.ch/category/19377/?a=b&c=d' + indico_url_category_with_query_with_ics = 'https://indico.cern.ch/category/19377/events.ics?a=b&c=d' + url_with_ics = 'https://mywebsite.com/event/blabla/events.ics' + url_with_query_with_ics = 'https://mywebsite.com/event/blabla/events.ics?john=doe&isstub=born' + url_no_ical = 'https://mywebsite.com/event/blabla' + url_with_ical = 'https://mywebsite.com/event/blabla?ical=true' + + # When indico link – event + assert_equal ingestor.send(:to_export, indico_url_event), indico_url_event_with_ics # adds ics + assert_equal ingestor.send(:to_export, indico_url_event_with_query), indico_url_event_with_query_with_ics # adds ics + + # When indico link – category + assert_equal ingestor.send(:to_export, indico_url_category), indico_url_category_with_ics # adds ics + assert_equal ingestor.send(:to_export, indico_url_category_with_query), indico_url_category_with_query_with_ics # adds ics + + # When non-indico link + assert_equal ingestor.send(:to_export, url_with_ics), url_with_ics # keeps same + assert_equal ingestor.send(:to_export, url_with_query_with_ics), url_with_query_with_ics # keeps same + + # When indico link which already has the /events.ics + assert_equal ingestor.send(:to_export, indico_url_event_with_ics), indico_url_event_with_ics # keeps it as-is + assert_equal ingestor.send(:to_export, indico_url_event_with_query_with_ics), indico_url_event_with_query_with_ics # keeps it as-is + + # When other url, adds the ical query param + assert_equal ingestor.send(:to_export, url_no_ical), url_with_ical + + # When other url with ical query param, keep it as-is + assert_equal ingestor.send(:to_export, url_with_ical), url_with_ical + end + + private + + def check_event_exists(title, url) + events = Event.where(title: title, url: url) + assert (!events.nil? and events.size > 0), "event title[#{title}] not found" + assert events.size < 2, "event[#{title}] duplicates found = #{events.size}" + events.first + end +end From 19fe8c24e2313ce0d87e34bd5600e8c9ef71a7fe Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:09:26 +0100 Subject: [PATCH 07/10] chore(indico_ingestor): removed references to ical --- lib/ingestors/indico_ingestor.rb | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb index 2ee1cd1a3..0a4363689 100644 --- a/lib/ingestors/indico_ingestor.rb +++ b/lib/ingestors/indico_ingestor.rb @@ -32,13 +32,13 @@ def read(source_url) private # Modifies the given URL to the ics export. - # Loops into each Ical event to process it. - # Note: One .ics file can have multiple Ical events. + # Loops into each event to process it. + # Note: One .ics file can have multiple events. def process_url(url) export_url = to_export(url) raise 'Not an indico link' if export_url.nil? - content = open_url(export_url, token: @token, raise: true).set_encoding('utf-8') + content = open_url(export_url, raise: true, token: @token).set_encoding('utf-8') events = Icalendar::Event.parse(content) raise 'Not found' if events.nil? || events.empty? @@ -51,8 +51,6 @@ def process_url(url) # 1. If the path already ends with '/events.ics', return as-is. # 2. If the host includes 'indico', ensures the path ends with '/events.ics'. - # 3. Otherwise, append '?ical=true' query param if not already present. - # # This method never mutates the original URL string. # Returns the updated URL string or nil if input is blank. def to_export(url) @@ -65,8 +63,6 @@ def to_export(url) uri.to_s elsif indico_page?(uri) ensure_events_ics_path(uri) - else - nil end end @@ -93,14 +89,6 @@ def ensure_events_ics_path(uri) uri.to_s end - # Ensures the URL has '?ical=true' in its query params - def ensure_ical_query(uri) - query = URI.decode_www_form(uri.query.to_s).to_h - query['ical'] = 'true' unless query['ical'] == 'true' - uri.query = URI.encode_www_form(query) - uri.to_s - end - # Builds the OpenStruct event and adds it in event. def process_calevent(calevent) event_to_add = OpenStruct.new.tap do |event| @@ -110,7 +98,7 @@ def process_calevent(calevent) end add_event(event_to_add) rescue StandardError => e - @messages << "Process iCalendar failed with: #{e.message}" + @messages << "process_calevent failed with: #{e.message}" end # Assigns to event: url, title, description, keywords. From 5263990f3caa86f1d293f61a8d96c39a5e3072dc Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Thu, 4 Dec 2025 11:08:01 +0100 Subject: [PATCH 08/10] fix(indico_ingestor): presence, stderr --- app/controllers/events_controller.rb | 2 +- lib/ingestors/indico_ingestor.rb | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index 86ec50080..67a2ac962 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -236,7 +236,7 @@ def event_params :timezone, :content_provider_id, { collection_ids: [] }, { node_ids: [] }, { node_names: [] }, { target_audience: [] }, { eligibility: [] }, :visible, { host_institutions: [] }, :capacity, :contact, :recognition, :learning_objectives, - :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, + :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, :presence, external_resources_attributes: %i[id url title _destroy], external_resources: %i[url title], material_ids: [], llm_interaction_attributes: %i[id scrape_or_process model prompt input output needs_processing _destroy], diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb index 0a4363689..2e20a2ec1 100644 --- a/lib/ingestors/indico_ingestor.rb +++ b/lib/ingestors/indico_ingestor.rb @@ -94,11 +94,11 @@ def process_calevent(calevent) event_to_add = OpenStruct.new.tap do |event| assign_basic_info(event, calevent) assign_time_info(event, calevent) - assign_location_info(event, calevent.location) + assign_location_info(event, calevent) end add_event(event_to_add) rescue StandardError => e - @messages << "process_calevent failed with: #{e.message}" + Rails.logger.error("#{e.class}: #{e.message}") end # Assigns to event: url, title, description, keywords. @@ -118,11 +118,13 @@ def assign_time_info(event, calevent) end # Assigns to event: venue, online, city. - def assign_location_info(event, location) + def assign_location_info(event, calevent) + location = calevent.location return if location.blank? event.venue = location.to_s - event.online = location.downcase.include?('online') + event.online = calevent.description.include?('zoom') + event.presence = calevent.description.include?('zoom') ? :hybrid : :onsite # can do best, but sufficient for now event.city, event.postcode, event.country = process_location(location) end From ab659e0f95c8ab542a44e79475e2f3871f7e92d4 Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Mon, 8 Dec 2025 17:25:57 +0100 Subject: [PATCH 09/10] test(indico_ingestor): added tests --- lib/ingestors/indico_ingestor.rb | 18 +- .../fixtures/files/ingestion/indico/event.ics | 16 + .../files/ingestion/indico/events.ics | 25 ++ .../files/ingestion/indico/indico.html | 16 + test/unit/ingestors/indico_ingestor_test.rb | 276 +++++------------- 5 files changed, 131 insertions(+), 220 deletions(-) create mode 100644 test/fixtures/files/ingestion/indico/event.ics create mode 100644 test/fixtures/files/ingestion/indico/events.ics create mode 100644 test/fixtures/files/ingestion/indico/indico.html diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb index 2e20a2ec1..f826fd744 100644 --- a/lib/ingestors/indico_ingestor.rb +++ b/lib/ingestors/indico_ingestor.rb @@ -106,7 +106,7 @@ def assign_basic_info(event, calevent) event.url = calevent.url.to_s event.title = calevent.summary.to_s event.description = calevent.description.to_s - event.keywords = process_keywords(calevent.categories) + event.keywords = calevent.categories.flatten event.contact = calevent.contact.join(', ') end @@ -125,7 +125,6 @@ def assign_location_info(event, calevent) event.venue = location.to_s event.online = calevent.description.include?('zoom') event.presence = calevent.description.include?('zoom') ? :hybrid : :onsite # can do best, but sufficient for now - event.city, event.postcode, event.country = process_location(location) end # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field. @@ -138,20 +137,5 @@ def get_tzid(dtstart) tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s end - - # Returns an array of 3 location characteristics: suburb, postcode, country - # Everything is nil if location.blank or location is online - def process_location(location) - return [location['suburb'], location['postcode'], location['country']] if location.is_a?(Array) - - [nil, nil, nil] - end - - # Returns keywords from the `CATEGORIES` ICal field - def process_keywords(categories) - return [] if categories.blank? - - categories.flatten.compact.map { |cat| cat.to_s.strip } - end end end diff --git a/test/fixtures/files/ingestion/indico/event.ics b/test/fixtures/files/ingestion/indico/event.ics new file mode 100644 index 000000000..58728eadf --- /dev/null +++ b/test/fixtures/files/ingestion/indico/event.ics @@ -0,0 +1,16 @@ +BEGIN:VCALENDAR +VERSION:2.0 +PRODID:-//CERN//INDICO//EN +BEGIN:VEVENT +SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials +DTSTART:20260309T080000Z +DTEND:20260313T161500Z +DTSTAMP:20251204T105300Z +UID:indico-event-1617123@indico.cern.ch +CONTACT:name.surname@test.com +DESCRIPTION:speakers and zoom here +LOCATION:CERN +URL:https://indico.cern.ch/event/1617123/ +CATEGORIES:TRAINING,EDUCATION +END:VEVENT +END:VCALENDAR \ No newline at end of file diff --git a/test/fixtures/files/ingestion/indico/events.ics b/test/fixtures/files/ingestion/indico/events.ics new file mode 100644 index 000000000..7f3bbfd22 --- /dev/null +++ b/test/fixtures/files/ingestion/indico/events.ics @@ -0,0 +1,25 @@ +BEGIN:VCALENDAR +VERSION:2.0 +PRODID:-//CERN//INDICO//EN +BEGIN:VEVENT +SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials +DTSTART:20260309T080000Z +DTEND:20260313T161500Z +DTSTAMP:20251203T150800Z +UID:indico-event-1617123@indico.cern.ch +CONTACT:name.surname@test.com +DESCRIPTION:speakers and zoom here +LOCATION:CERN +URL:https://indico.cern.ch/event/1617123/ +END:VEVENT +BEGIN:VEVENT +SUMMARY:HEP C++ Course and Hands-on Training - Stay Informed +DTSTART:20991231T225800Z +DTEND:20991231T225900Z +DTSTAMP:20251203T150800Z +UID:indico-event-1211412@indico.cern.ch +CONTACT:name.surname@test.com +DESCRIPTION:mockdescription +URL:https://indico.cern.ch/event/1211412/ +END:VEVENT +END:VCALENDAR \ No newline at end of file diff --git a/test/fixtures/files/ingestion/indico/indico.html b/test/fixtures/files/ingestion/indico/indico.html new file mode 100644 index 000000000..33765bb38 --- /dev/null +++ b/test/fixtures/files/ingestion/indico/indico.html @@ -0,0 +1,16 @@ + + + + + My Agenda (Indico) + + + + + + + + + \ No newline at end of file diff --git a/test/unit/ingestors/indico_ingestor_test.rb b/test/unit/ingestors/indico_ingestor_test.rb index eaf19191a..7d9183df7 100644 --- a/test/unit/ingestors/indico_ingestor_test.rb +++ b/test/unit/ingestors/indico_ingestor_test.rb @@ -2,205 +2,67 @@ class IndicoIngestorTest < ActiveSupport::TestCase setup do + @ingestor = Ingestors::IndicoIngestor.new @user = users(:regular_user) @content_provider = content_providers(:another_portal_provider) - mock_ingestions - # mock_nominatim - mock_timezone # System time zone should not affect test result + # mock_ingestions + # mock_timezone # System time zone should not affect test result + + webmock('https://indico.cern.ch/event/1617123/', 'indico/indico.html') + webmock('https://indico.cern.ch/event/1617123/event.ics', 'indico/event.ics') + webmock('https://indico.cern.ch/category/11733/', 'indico/indico.html') + webmock('https://indico.cern.ch/category/11733/events.ics', 'indico/events.ics') + webmock('https://myagenda.com/event/1617123/', 'indico/indico.html') + webmock('https://myagenda.com/event/1617123/event.ics', 'indico/event.ics') end teardown do reset_timezone end - test 'sitemap not found' do - source = @content_provider.sources.build(url: 'https://missing.org/sitemap.xml', - method: 'ical', - enabled: true) - ingestor = Ingestors::IcalIngestor.new - - assert_no_difference('Event.count') do - ingestor.read(source.url) - ingestor.write(@user, @content_provider) - end - - assert ingestor.events.empty? - assert ingestor.materials.empty? - assert_includes ingestor.messages[0], 'Extract from sitemap[https://missing.org/sitemap.xml] failed with:' + test 'should read indico link event' do + @ingestor.read('https://indico.cern.ch/event/1617123/') + @ingestor.write(@user, @content_provider) + + sample = @ingestor.events.detect { |e| e.title == '14th HEP C++ Course and Hands-on Training - The Essentials' } + assert sample.persisted? + + assert_equal sample.url, 'https://indico.cern.ch/event/1617123/' + assert_equal sample.title, '14th HEP C++ Course and Hands-on Training - The Essentials' + assert_equal sample.description, 'speakers and zoom here' + assert_equal sample.keywords, %w[TRAINING EDUCATION] + assert_equal sample.contact, 'name.surname@test.com' + assert_equal sample.start, '2026-03-09 08:00:00 +0000' + assert_equal sample.end, '2026-03-13 16:15:00 +0000' + assert_equal sample.timezone, 'UTC' + assert_equal sample.venue, 'CERN' + assert_match sample.presence, 'hybrid' end - test 'ingest valid sitemap' do - source = @content_provider.sources.build(url: 'https://app.com/events/sitemap.xml', - method: 'ical', - enabled: true) - ingestor = Ingestors::IcalIngestor.new - - # check two events to be updated - name = 'ical_event_1' - event = events(:ical_event_1) - refute event.nil?, "event[#{name}] not found" - refute event.online?, "event[#{name}] online not matched" - assert_equal 'Another Portal Provider', event.content_provider.title, - "event[#{name}] content provider not matched" - - name = 'ical_event_2' - refute events(name).nil?, "fixture[#{name}] not found" - title = 'PaCER Seminar: Computational Fluid Dynamics' - url = 'https://pawsey.org.au/event/pacer-seminar-computational-fluid-dynamics/' - event = check_event_exists title, url - refute event.nil?, "event title[#{title}] not found" - refute event.online?, "event title[#{title}] online not matched" - assert_equal 'Another Portal Provider', event.content_provider.title, - "event title[#{title}] content provider not matched" - - assert_difference('Event.count', 4) do - freeze_time(2019) do - ingestor.read(source.url) - ingestor.write(@user, @content_provider) - end - end + test 'should read indico link category' do + @ingestor.read('https://indico.cern.ch/category/11733/') + @ingestor.write(@user, @content_provider) - assert_equal 8, ingestor.events.count - assert ingestor.materials.empty? - assert_equal 4, ingestor.stats[:events][:added] - assert_equal 2, ingestor.stats[:events][:updated] - assert_equal 2, ingestor.stats[:events][:rejected] - - # check individual events - # check not found - assert_includes ingestor.messages, "Process file url\[https://pawsey.org.au/events/\?ical=true\] failed with: 404 " - - # check rejected - event = ingestor.events.detect { |e| e.title == 'NVIDIA cuQuantum Session' } - assert event - assert event.errors.added?(:url, :url, value: '123') - event = ingestor.events.detect { |e| e.title == 'PaCER Seminar: Radio astronomy' } - assert event - assert event.errors.added?(:url, :blank) - - # check added - title = 'Ask Me Anything: Porous media visualisation and LBPM' - event = check_event_exists title, 'https://pawsey.org.au/event/ask-me-anything-porous-media-visualisation-and-lbpm/' - assert event.online?, "event title[#{event.title}] online not matched" - assert (!event.keywords.nil? and event.keywords.size == 2), "event title[#{event.title}] keywords.size not matched" - assert event.keywords.include?('AMA'), "event title[#{event.title}] keyword[AMA] not found" - assert event.keywords.include?('Visualisation'), "event title[#{event.title}] keyword[Visualisation] not found" - - title = 'Pawsey Intern Showcase 2022' - event = check_event_exists title, 'https://pawsey.org.au/event/pawsey-intern-showcase-2022/' - assert_includes event.description, 'The Pawsey Supercomputing Research Centre takes prides in its Summer Internship Program' - assert_includes event.description, 'range of trainings we immerse students in during Week 1 of the Program (and throughout).' - assert_equal 'Perth', event.timezone.to_s, "event title[#{event.title}] timezone not matched" - assert_equal '2022-02-11 01:45:00 UTC', event.start.utc.to_s, "event title[#{event.title}] start not matched" - assert_equal '2022-02-11 04:50:00 UTC', event.end.utc.to_s, "event title[#{event.title}] end not matched" - - title = 'P\'Con - Experience with porting and scaling codes on AMD GPUs' - event = check_event_exists title, 'https://pawsey.org.au/event/experience-with-porting-and-scaling-codes-on-amd-gpus/' - assert event.online?, "event title[#{title}] online not matched" - - title = 'Overview of High Performance Computing Resources at OLCF' - event = check_event_exists title, 'https://pawsey.org.au/event/overview-of-high-performance-computing-resources-at-olcf/' - refute event.online?, "event title[#{title}] online not matched" - location = 'Pawsey Supercomputing Centre, 1 Bryce Avenue, Kensington, Western Australia, 6151, Australia' - assert_equal location, event.venue, "event title[#{title}] venue not matched" - # Geocoding is disabled so these fail TODO: Re-enable, but using cache + rate limiting - # assert_equal 'Kensington', event.city, "event title[#{title}] city not matched" - # assert_equal '6151', event.postcode, "event title[#{title}] postcode not matched" - # assert_equal 'Australia', event.country, "event title[#{title}] country not matched" - - # TODO: check updated - title = 'PaCER Seminar: Computational Fluid Dynamics' - event = check_event_exists title, 'https://pawsey.org.au/event/pacer-seminar-computational-fluid-dynamics/' - assert_equal '2022-06-15 03:00:00 UTC', event.end.utc.to_s, "event title[#{event.title}] updated end not matched" - assert event.description != 'MyText', "event title[#{event.title}] description not updated" - assert event.description.size > 100, "event title[#{event.title}] description too short" - assert event.online?, "event title[#{event.title}] online not matched" - assert_equal 2, event.keywords.size, "event title[#{event.title}] keywords size not matched" - %w[Supercomputing Seminar].each do |keyword| - assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found" - end - assert_equal 'Online, Virtual, Australia', event.venue, "event title[#{event.title}] venue not matched" - assert event.city.nil?, "event title[#{event.title}] city not matched" - assert event.postcode.nil?, "event title[#{event.title}] postcode not matched" - assert event.country.nil?, "event title[#{event.title}] country not matched" - - title = "P'Con - Embracing new solutions for in-situ visualisation" - event = check_event_exists title, 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/' - assert event.online?, "event title[#{event.title}] online not matched" - assert_equal 3, event.keywords.size, "event title[#{event.title}] keywords size not matched" - %w[Supercomputing Conference Visualisation].each do |keyword| - assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found" - end - assert_equal 'Online, Virtual, Australia', event.venue, "event title[#{event.title}] venue not matched" - assert event.postcode.nil?, "event title[#{event.title}] postcode not matched" - assert event.city.nil?, "event title[#{event.title}] city not matched" - assert event.country.nil?, "event title[#{event.title}] country not matched" - end + sample = @ingestor.events.detect { |e| e.title == '14th HEP C++ Course and Hands-on Training - The Essentials' } + sample2 = @ingestor.events.detect { |e| e.title == 'HEP C++ Course and Hands-on Training - Stay Informed' } + assert sample.persisted? + assert sample2.persisted? - test 'check single ical sources' do - # override time - assert_no_difference 'Event.count' do - freeze_time(2019) do - ingestor = Ingestors::IcalIngestor.new - source = @content_provider.sources.build( - url: 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/?ical=true', - method: 'ical', enabled: true - ) - - ingestor.read(source.url) - ingestor.write(@user, @content_provider) - - assert_equal 1, ingestor.events.count - assert ingestor.materials.empty? - assert_equal 0, ingestor.stats[:events][:added] - assert_equal 1, ingestor.stats[:events][:updated] - assert_equal 0, ingestor.stats[:events][:rejected] - - ingestor = Ingestors::IcalIngestor.new - source = @content_provider.sources.build( - url: 'https://pawsey.org.au/event/pawsey-intern-showcase-2021/?ical=true', - method: 'ical', enabled: true - ) - - ingestor.read(source.url) - ingestor.write(@user, @content_provider) - - assert_equal 1, ingestor.events.count - assert ingestor.materials.empty? - assert_equal 0, ingestor.stats[:events][:added] - assert_equal 0, ingestor.stats[:events][:updated] - assert_equal 1, ingestor.stats[:events][:rejected] - - event = ingestor.events.detect { |e| e.title == 'Pawsey Intern Showcase 2021' } - assert event - assert event.errors.added?(:url, :blank) - end - end - - # get updated - title = 'P\'Con - Embracing new solutions for in-situ visualisation' - url = 'https://pawsey.org.au/event/pcon-embracing-new-solutions-for-in-situ-visualisation/' - event = check_event_exists title, url - assert_equal 3, event.keywords.size - %w[Supercomputing Conference Visualisation].each do |keyword| - assert event.keywords.include?(keyword), "event title[#{event.title}] keyword[#{keyword}] not found" - end + assert_equal sample.url, 'https://indico.cern.ch/event/1617123/' + assert_equal sample2.url, 'https://indico.cern.ch/event/1211412/' end - test 'process_calevent logs error when exception is raised' do - ingestor = Ingestors::IcalIngestor.new - calevent = Object.new # fake calevent + test 'should read non-indico link event' do + @ingestor.read('https://myagenda.com/event/1617123/') + @ingestor.write(@user, @content_provider) - # Stub a method that will raise an error - ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do - ingestor.send(:process_calevent, calevent) - end + sample = @ingestor.events.detect { |e| e.title == '14th HEP C++ Course and Hands-on Training - The Essentials' } + assert sample.persisted? - assert_includes ingestor.messages.last, 'Process iCalendar failed with: test failure' + assert_equal sample.url, 'https://indico.cern.ch/event/1617123/' end - test 'to_export method' do - ingestor = Ingestors::IcalIngestor.new + test 'should convert url properly' do indico_url_event = 'https://indico.cern.ch/event/1588342/' indico_url_event_with_ics = 'https://indico.cern.ch/event/1588342/event.ics' # ! when '/event', event.ics is singular indico_url_event_with_query = 'https://indico.cern.ch/event/1588342/?somerandom=urlparams&an=otherone' @@ -209,40 +71,48 @@ class IndicoIngestorTest < ActiveSupport::TestCase indico_url_category_with_ics = 'https://indico.cern.ch/category/19377/events.ics' # ! when '/category', eventS.ics is plural indico_url_category_with_query = 'https://indico.cern.ch/category/19377/?a=b&c=d' indico_url_category_with_query_with_ics = 'https://indico.cern.ch/category/19377/events.ics?a=b&c=d' - url_with_ics = 'https://mywebsite.com/event/blabla/events.ics' - url_with_query_with_ics = 'https://mywebsite.com/event/blabla/events.ics?john=doe&isstub=born' - url_no_ical = 'https://mywebsite.com/event/blabla' - url_with_ical = 'https://mywebsite.com/event/blabla?ical=true' + url_with_ics = 'https://mywebsite.com/event/blabla/event.ics' + url_with_query_with_ics = 'https://mywebsite.com/event/blabla/event.ics?john=doe&isstub=born' # When indico link – event - assert_equal ingestor.send(:to_export, indico_url_event), indico_url_event_with_ics # adds ics - assert_equal ingestor.send(:to_export, indico_url_event_with_query), indico_url_event_with_query_with_ics # adds ics + assert_equal @ingestor.send(:to_export, indico_url_event), indico_url_event_with_ics # adds event.ics + assert_equal @ingestor.send(:to_export, indico_url_event_with_query), indico_url_event_with_query_with_ics # adds event.ics # When indico link – category - assert_equal ingestor.send(:to_export, indico_url_category), indico_url_category_with_ics # adds ics - assert_equal ingestor.send(:to_export, indico_url_category_with_query), indico_url_category_with_query_with_ics # adds ics + assert_equal @ingestor.send(:to_export, indico_url_category), indico_url_category_with_ics # adds events.ics (with an s) + assert_equal @ingestor.send(:to_export, indico_url_category_with_query), indico_url_category_with_query_with_ics # adds events.ics (with an s) - # When non-indico link - assert_equal ingestor.send(:to_export, url_with_ics), url_with_ics # keeps same - assert_equal ingestor.send(:to_export, url_with_query_with_ics), url_with_query_with_ics # keeps same + # When non-indico but ics link + assert_equal @ingestor.send(:to_export, url_with_ics), url_with_ics # keeps same + assert_equal @ingestor.send(:to_export, url_with_query_with_ics), url_with_query_with_ics # keeps same # When indico link which already has the /events.ics - assert_equal ingestor.send(:to_export, indico_url_event_with_ics), indico_url_event_with_ics # keeps it as-is - assert_equal ingestor.send(:to_export, indico_url_event_with_query_with_ics), indico_url_event_with_query_with_ics # keeps it as-is + assert_equal @ingestor.send(:to_export, indico_url_event_with_ics), indico_url_event_with_ics # keeps it as-is + assert_equal @ingestor.send(:to_export, indico_url_event_with_query_with_ics), indico_url_event_with_query_with_ics # keeps it as-is + end - # When other url, adds the ical query param - assert_equal ingestor.send(:to_export, url_no_ical), url_with_ical + test 'should test std err' do + @ingestor.stub :open_url, ->(_url, *) { raise StandardError, 'test failure' } do + @ingestor.send(:process_url, 'https://indico.cern.ch/event/1617123/') - # When other url with ical query param, keep it as-is - assert_equal ingestor.send(:to_export, url_with_ical), url_with_ical + assert_equal 'Process file url[https://indico.cern.ch/event/1617123/event.ics] failed with: test failure', @ingestor.messages.first + end + + @ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do + mock_logger = Minitest::Mock.new + mock_logger.expect(:error, nil, ['StandardError: test failure']) + + Rails.stub(:logger, mock_logger) do + @ingestor.send(:process_calevent, Icalendar::Event.new) + end + mock_logger.verify + end end private - def check_event_exists(title, url) - events = Event.where(title: title, url: url) - assert (!events.nil? and events.size > 0), "event title[#{title}] not found" - assert events.size < 2, "event[#{title}] duplicates found = #{events.size}" - events.first + def webmock(url, filename) + file = Rails.root.join('test', 'fixtures', 'files', 'ingestion', filename) + WebMock.stub_request(:get, url).to_return(status: 200, headers: {}, body: file.read) end end From 6290e703bc42c7d40e0e173085e9e29b9ea2a48a Mon Sep 17 00:00:00 2001 From: kennethrioja <59597207+kennethrioja@users.noreply.github.com> Date: Wed, 17 Dec 2025 11:02:26 +0100 Subject: [PATCH 10/10] chore(sitemap helpers): rename get_sources to parse_sitemap --- lib/ingestors/concerns/sitemap_helpers.rb | 2 +- lib/ingestors/indico_ingestor.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/ingestors/concerns/sitemap_helpers.rb b/lib/ingestors/concerns/sitemap_helpers.rb index c6e1b251d..2f27a82f7 100644 --- a/lib/ingestors/concerns/sitemap_helpers.rb +++ b/lib/ingestors/concerns/sitemap_helpers.rb @@ -8,7 +8,7 @@ module SitemapHelpers # Reads either a sitemap.{xml|txt} or a single URL # Returns a list of URLs from 1 to n URLs - def get_sources(source_url) + def parse_sitemap(source_url) case source_url.downcase when /sitemap(.*)?\.xml\Z/ parse_xml_sitemap(source_url) diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb index f826fd744..475eb595a 100644 --- a/lib/ingestors/indico_ingestor.rb +++ b/lib/ingestors/indico_ingestor.rb @@ -21,7 +21,7 @@ def self.config def read(source_url) @token = Rails.application.config.secrets.indico_api_token @verbose = false - sources = get_sources(source_url) + sources = parse_sitemap(source_url) return if sources.nil? sources.each do |url|