diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index 3060ccf97..cf1a0e811 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -237,7 +237,7 @@ def event_params :timezone, :content_provider_id, { collection_ids: [] }, { node_ids: [] }, { node_names: [] }, { target_audience: [] }, { eligibility: [] }, :visible, { host_institutions: [] }, :capacity, :contact, :recognition, :learning_objectives, - :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, + :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, :presence, external_resources_attributes: %i[id url title _destroy], external_resources: %i[url title], material_ids: [], llm_interaction_attributes: %i[id scrape_or_process model prompt input output needs_processing _destroy], diff --git a/config/secrets.example.yml b/config/secrets.example.yml index 51f0fa8f1..d2161c560 100644 --- a/config/secrets.example.yml +++ b/config/secrets.example.yml @@ -38,6 +38,7 @@ external_api_keys: &external_api_keys password: gpt_api_key: willma_api_key: + indico_api_token: # begins with 'indp_', cf. 
https://docs.getindico.io/en/stable/http-api/access/#api-token-authentication orcid: client_id: secret: diff --git a/lib/ingestors/concerns/sitemap_helpers.rb b/lib/ingestors/concerns/sitemap_helpers.rb new file mode 100644 index 000000000..2f27a82f7 --- /dev/null +++ b/lib/ingestors/concerns/sitemap_helpers.rb @@ -0,0 +1,48 @@ +# frozen_string_literal: true + +module Ingestors + module Concerns + # Turns a sitemap.{xml|txt} or a single URL into the list of URLs (= sources) + module SitemapHelpers + private + + # Reads either a sitemap.{xml|txt} or a single URL (the file extension is matched case-insensitively) + # Returns the list of URLs ([source_url] for a non-sitemap URL), or nil if reading an .xml sitemap fails + def parse_sitemap(source_url) + case source_url.downcase + when /sitemap(.*)?\.xml\Z/ + parse_xml_sitemap(source_url) + when /sitemap(.*)?\.txt\Z/ + parse_txt_sitemap(source_url) + else + [source_url] + end + end + + def parse_xml_sitemap(url) + urls = SitemapParser.new( + url, + recurse: true, + headers: { 'User-Agent' => config[:user_agent] } + ).to_a.uniq.map(&:strip) + + log_sitemap('xml', url, urls.count) + urls + rescue StandardError => e + @messages << "Extract from sitemap[#{url}] failed with: #{e.message}" + nil + end + + def parse_txt_sitemap(url) + urls = open_url(url).to_a.uniq.map(&:strip) + + log_sitemap('txt', url, urls.count) + urls + end + + def log_sitemap(ext, url, count) + @messages << "Parsing .#{ext} sitemap: #{url}\n - #{count} URLs found" + end + end + end +end diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb new file mode 100644 index 000000000..475eb595a --- /dev/null +++ b/lib/ingestors/indico_ingestor.rb @@ -0,0 +1,141 @@ +# frozen_string_literal: true + +require 'icalendar' +require 'nokogiri' +require 'open-uri' +require 'tzinfo' + +module Ingestors + # Reads from direct .ics or Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps. 
+ class IndicoIngestor < Ingestor + include Ingestors::Concerns::SitemapHelpers + + def self.config + { + key: 'indico', + title: 'Indico / .ics file', + category: :events + } + end + + def read(source_url) + @token = Rails.application.config.secrets.indico_api_token + @verbose = false + sources = parse_sitemap(source_url) + return if sources.nil? + + sources.each do |url| + process_url(url) + end + end + + private + + # Converts the given URL to its .ics export URL and fetches it. + # Loops over each parsed event to process it; any error is recorded in @messages. + # Note: One .ics file can have multiple events. + def process_url(url) + export_url = to_export(url) + raise 'Not an indico link' if export_url.nil? + + content = open_url(export_url, raise: true, token: @token).set_encoding('utf-8') + events = Icalendar::Event.parse(content) + raise 'Not found' if events.nil? || events.empty? + + events.each do |e| + process_calevent(e) + end + rescue StandardError => e + @messages << "Process file url[#{export_url}] failed with: #{e.message}" + end + + # 1. If the path already ends with '/event.ics' or '/events.ics', returns the URL as-is. + # 2. If indico_page?(uri) is true, rewrites the path to the '.ics' export (see ensure_events_ics_path). + # This method never mutates the original URL string. + # Returns the export URL string, or nil if the input is blank or the page is not an Indico page. 
+ + uri = URI.parse(url) + path = uri.path.to_s + + if path.match?(%r{/(event|events)\.ics\z}) + uri.to_s + elsif indico_page?(uri) + ensure_events_ics_path(uri) + end + end + + def indico_page?(uri) + # Either checks in host, e.g., 'indico.myinstitution.com' + return true if uri.host&.include?('indico') + + # Or fetches the page and checks its og:site_name meta tag + html = open_url(uri, raise: true) + doc = Nokogiri::HTML(html) + content = doc.at('meta[property="og:site_name"]')&.[]('content') + content&.match?(/indico/i) + end + + # Keeps the first two path segments and appends 'event.ics' (event pages) or 'events.ics' (category pages); other paths are only truncated + def ensure_events_ics_path(uri) + paths = uri.path.split('/') + uri.path = "#{paths[0..2].join('/')}/" + if paths[1] == 'event' + uri.path = File.join(uri.path, 'event.ics') + elsif paths[1] == 'category' + uri.path = File.join(uri.path, 'events.ics') + end + uri.to_s + end + + # Builds an OpenStruct from the calendar event and passes it to add_event; errors are logged, not raised. + def process_calevent(calevent) + event_to_add = OpenStruct.new.tap do |event| + assign_basic_info(event, calevent) + assign_time_info(event, calevent) + assign_location_info(event, calevent) + end + add_event(event_to_add) + rescue StandardError => e + Rails.logger.error("#{e.class}: #{e.message}") + end + + # Assigns to event: url, title, description, keywords, contact. + def assign_basic_info(event, calevent) + event.url = calevent.url.to_s + event.title = calevent.summary.to_s + event.description = calevent.description.to_s + event.keywords = calevent.categories.flatten + event.contact = calevent.contact.join(', ') + end + + # Assigns to event: start, end, timezone. + def assign_time_info(event, calevent) + event.start = calevent.dtstart&.to_time unless calevent.dtstart.nil? + event.end = calevent.dtend&.to_time unless calevent.dtend.nil? + event.timezone = get_tzid(calevent.dtstart) + end + + # Assigns to event: venue, online, presence (nothing is assigned when location is blank). NOTE(review): calevent.description is used unguarded below; it may be nil when DESCRIPTION is absent - confirm. + def assign_location_info(event, calevent) + location = calevent.location + return if location.blank? 
+ + event.venue = location.to_s + event.online = calevent.description.include?('zoom') + event.presence = calevent.description.include?('zoom') ? :hybrid : :onsite # heuristic: 'zoom' in the description means hybrid; could be refined, but sufficient for now + end + + # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field. + # Handles whether tzid shows up as an Array or a single string; returns nil when no TZID is available + def get_tzid(dtstart) + return nil unless dtstart.respond_to?(:ical_params) + + tzid = dtstart.ical_params['tzid'] + return nil if tzid.nil? + + tzid.is_a?(Array) ? tzid.first.to_s : tzid.to_s + end + end +end diff --git a/lib/ingestors/ingestor.rb b/lib/ingestors/ingestor.rb index 8f043f135..a662985cf 100644 --- a/lib/ingestors/ingestor.rb +++ b/lib/ingestors/ingestor.rb @@ -50,13 +50,14 @@ def stats_summary(type) summary end - def open_url(url, raise: false) + def open_url(url, raise: false, token: nil) options = { redirect: false, # We're doing redirects manually below, since open-uri can't handle http -> https redirection read_timeout: 5 } options[:ssl_verify_mode] = config[:ssl_verify_mode] if config.key?(:ssl_verify_mode) redirect_attempts = 5 + options['Authorization'] = "Bearer #{token}" unless token.nil? 
begin URI(url).open(options) rescue OpenURI::HTTPRedirect => e diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb index fe1a7bd3a..1da61a790 100644 --- a/lib/ingestors/ingestor_factory.rb +++ b/lib/ingestors/ingestor_factory.rb @@ -6,6 +6,7 @@ def self.ingestors Ingestors::EventbriteIngestor, Ingestors::EventCsvIngestor, Ingestors::IcalIngestor, + Ingestors::IndicoIngestor, Ingestors::LibcalIngestor, Ingestors::MaterialCsvIngestor, Ingestors::TessEventIngestor, diff --git a/test/fixtures/files/ingestion/indico/event.ics b/test/fixtures/files/ingestion/indico/event.ics new file mode 100644 index 000000000..58728eadf --- /dev/null +++ b/test/fixtures/files/ingestion/indico/event.ics @@ -0,0 +1,16 @@ +BEGIN:VCALENDAR +VERSION:2.0 +PRODID:-//CERN//INDICO//EN +BEGIN:VEVENT +SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials +DTSTART:20260309T080000Z +DTEND:20260313T161500Z +DTSTAMP:20251204T105300Z +UID:indico-event-1617123@indico.cern.ch +CONTACT:name.surname@test.com +DESCRIPTION:speakers and zoom here +LOCATION:CERN +URL:https://indico.cern.ch/event/1617123/ +CATEGORIES:TRAINING,EDUCATION +END:VEVENT +END:VCALENDAR \ No newline at end of file diff --git a/test/fixtures/files/ingestion/indico/events.ics b/test/fixtures/files/ingestion/indico/events.ics new file mode 100644 index 000000000..7f3bbfd22 --- /dev/null +++ b/test/fixtures/files/ingestion/indico/events.ics @@ -0,0 +1,25 @@ +BEGIN:VCALENDAR +VERSION:2.0 +PRODID:-//CERN//INDICO//EN +BEGIN:VEVENT +SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials +DTSTART:20260309T080000Z +DTEND:20260313T161500Z +DTSTAMP:20251203T150800Z +UID:indico-event-1617123@indico.cern.ch +CONTACT:name.surname@test.com +DESCRIPTION:speakers and zoom here +LOCATION:CERN +URL:https://indico.cern.ch/event/1617123/ +END:VEVENT +BEGIN:VEVENT +SUMMARY:HEP C++ Course and Hands-on Training - Stay Informed +DTSTART:20991231T225800Z +DTEND:20991231T225900Z +DTSTAMP:20251203T150800Z 
+UID:indico-event-1211412@indico.cern.ch +CONTACT:name.surname@test.com +DESCRIPTION:mockdescription +URL:https://indico.cern.ch/event/1211412/ +END:VEVENT +END:VCALENDAR \ No newline at end of file diff --git a/test/fixtures/files/ingestion/indico/indico.html b/test/fixtures/files/ingestion/indico/indico.html new file mode 100644 index 000000000..33765bb38 --- /dev/null +++ b/test/fixtures/files/ingestion/indico/indico.html @@ -0,0 +1,16 @@ + + + +
+