diff --git a/app/controllers/events_controller.rb b/app/controllers/events_controller.rb index 3060ccf97..cf1a0e811 100644 --- a/app/controllers/events_controller.rb +++ b/app/controllers/events_controller.rb @@ -237,7 +237,7 @@ def event_params :timezone, :content_provider_id, { collection_ids: [] }, { node_ids: [] }, { node_names: [] }, { target_audience: [] }, { eligibility: [] }, :visible, { host_institutions: [] }, :capacity, :contact, :recognition, :learning_objectives, - :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, + :prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, :presence, external_resources_attributes: %i[id url title _destroy], external_resources: %i[url title], material_ids: [], llm_interaction_attributes: %i[id scrape_or_process model prompt input output needs_processing _destroy], diff --git a/config/secrets.example.yml b/config/secrets.example.yml index 51f0fa8f1..d2161c560 100644 --- a/config/secrets.example.yml +++ b/config/secrets.example.yml @@ -38,6 +38,7 @@ external_api_keys: &external_api_keys password: gpt_api_key: willma_api_key: + indico_api_token: # begins with 'indp_', cf.
https://docs.getindico.io/en/stable/http-api/access/#api-token-authentication
   orcid:
     client_id:
     secret:
diff --git a/lib/ingestors/concerns/sitemap_helpers.rb b/lib/ingestors/concerns/sitemap_helpers.rb
new file mode 100644
index 000000000..2f27a82f7
--- /dev/null
+++ b/lib/ingestors/concerns/sitemap_helpers.rb
@@ -0,0 +1,74 @@
+# frozen_string_literal: true
+
+module Ingestors
+  module Concerns
+    # Expands an ingestion source into the list of URLs to process: a sitemap
+    # (.xml or .txt) is expanded into its entries, any other URL is used as-is.
+    # The including class is expected to provide `config`, `open_url` and `@messages`.
+    module SitemapHelpers
+      private
+
+      # Returns the URLs designated by +source_url+.
+      #
+      # @param source_url [String] a sitemap*.xml URL, a sitemap*.txt URL, or any other URL
+      # @return [Array<String>, nil] the sitemap entries, or +[source_url]+ when it is not
+      #   a sitemap; nil when the sitemap could not be read
+      def parse_sitemap(source_url)
+        case source_url.downcase
+        when /sitemap(.*)?\.xml\Z/
+          parse_xml_sitemap(source_url)
+        when /sitemap(.*)?\.txt\Z/
+          parse_txt_sitemap(source_url)
+        else
+          [source_url]
+        end
+      end
+
+      # Reads an XML sitemap through SitemapParser (with recurse: true).
+      # Errors are recorded in @messages instead of being raised.
+      #
+      # @return [Array<String>, nil] the cleaned URLs, or nil on failure
+      def parse_xml_sitemap(url)
+        parser = SitemapParser.new(
+          url,
+          recurse: true,
+          headers: { 'User-Agent' => config[:user_agent] }
+        )
+        urls = clean_sitemap_urls(parser.to_a)
+
+        log_sitemap('xml', url, urls.count)
+        urls
+      rescue StandardError => e
+        @messages << "Extract from sitemap[#{url}] failed with: #{e.message}"
+        nil
+      end
+
+      # Reads a plain-text sitemap, one URL per line.
+      # Errors are recorded in @messages instead of propagating to the caller,
+      # consistently with parse_xml_sitemap.
+      #
+      # @return [Array<String>, nil] the cleaned URLs, or nil on failure
+      def parse_txt_sitemap(url)
+        urls = clean_sitemap_urls(open_url(url).to_a)
+
+        log_sitemap('txt', url, urls.count)
+        urls
+      rescue StandardError => e
+        @messages << "Extract from sitemap[#{url}] failed with: #{e.message}"
+        nil
+      end
+
+      # Strips entries before deduplicating, so that two lines differing only by
+      # surrounding whitespace (e.g. a last line without "\n") count once, and
+      # drops blank lines.
+      def clean_sitemap_urls(entries)
+        entries.map(&:strip).reject(&:empty?).uniq
+      end
+
+      # Records in @messages how many URLs were found in the sitemap.
+      def log_sitemap(ext, url, count)
+        @messages << "Parsing .#{ext} sitemap: #{url}\n - #{count} URLs found"
+      end
+    end
+  end
+end
diff --git a/lib/ingestors/indico_ingestor.rb b/lib/ingestors/indico_ingestor.rb
new file mode 100644
index 000000000..475eb595a
--- /dev/null
+++ b/lib/ingestors/indico_ingestor.rb
@@ -0,0 +1,147 @@
+# frozen_string_literal: true
+
+require 'icalendar'
+require 'nokogiri'
+require 'open-uri'
+require 'tzinfo'
+
+module Ingestors
+  # Reads events from direct .ics URLs or Indico (event or category) URLs, given
+  # either directly or through a .xml or .txt sitemap.
+  class IndicoIngestor < Ingestor
+    include Ingestors::Concerns::SitemapHelpers
+
+    def self.config
+      {
+        key: 'indico',
+        title: 'Indico / .ics file',
+        category: :events
+      }
+    end
+
+    # Expands +source_url+ (sitemap or single URL) and processes each resulting URL.
+    # Returns early when parse_sitemap yields nil.
+    def read(source_url)
+      @token = Rails.application.config.secrets.indico_api_token
+      @verbose = false
+      sources = parse_sitemap(source_url)
+      return if sources.nil?
+
+      sources.each { |url| process_url(url) }
+    end
+
+    private
+
+    # Converts the given URL to its .ics export, downloads it and processes every
+    # event found in it (one .ics file can hold several events).
+    # Failures are recorded in @messages instead of being raised.
+    def process_url(url)
+      export_url = to_export(url)
+      raise 'Not an indico link' if export_url.nil?
+
+      content = open_url(export_url, raise: true, token: @token).set_encoding('utf-8')
+      calevents = Icalendar::Event.parse(content)
+      raise 'Not found' if calevents.nil? || calevents.empty?
+
+      calevents.each { |calevent| process_calevent(calevent) }
+    rescue StandardError => e
+      # export_url is still nil when to_export raised or found no export: report the input URL then.
+      @messages << "Process file url[#{export_url || url}] failed with: #{e.message}"
+    end
+
+    # Returns the .ics export URL for +url+, leaving the original string untouched:
+    # 1. a path already ending with '/event.ics' or '/events.ics' is returned as-is;
+    # 2. an Indico page (see indico_page?) gets the export file name appended.
+    # Returns nil when +url+ is blank or matches neither case.
+    def to_export(url)
+      return nil if url.blank?
+
+      uri = URI.parse(url)
+      if uri.path.to_s.match?(%r{/events?\.ics\z})
+        uri.to_s
+      elsif indico_page?(uri)
+        ensure_events_ics_path(uri)
+      end
+    end
+
+    # Tells whether +uri+ looks like an Indico page: either its host contains 'indico'
+    # (e.g. 'indico.myinstitution.com'), or the page, which is then downloaded, has an
+    # og:site_name meta tag mentioning Indico.
+    def indico_page?(uri)
+      return true if uri.host&.include?('indico')
+
+      doc = Nokogiri::HTML(open_url(uri, raise: true))
+      site_name = doc.at('meta[property="og:site_name"]')&.[]('content')
+      site_name.to_s.match?(/indico/i)
+    end
+
+    # Rewrites the path of an Indico URL to its iCalendar export:
+    # '/event/<id>/event.ics' (singular) or '/category/<id>/events.ics' (plural).
+    # Only the path is changed. For any other kind of path, it is merely cut after
+    # its second segment.
+    def ensure_events_ics_path(uri)
+      segments = uri.path.split('/')
+      base_path = "#{segments[0..2].join('/')}/"
+      export_file = { 'event' => 'event.ics', 'category' => 'events.ics' }[segments[1]]
+      uri.path = export_file ? File.join(base_path, export_file) : base_path
+      uri.to_s
+    end
+
+    # Builds an OpenStruct from the iCalendar event and passes it to add_event.
+    # Errors are logged, not raised.
+    def process_calevent(calevent)
+      event_to_add = OpenStruct.new.tap do |event|
+        assign_basic_info(event, calevent)
+        assign_time_info(event, calevent)
+        assign_location_info(event, calevent)
+      end
+      add_event(event_to_add)
+    rescue StandardError => e
+      Rails.logger.error("#{e.class}: #{e.message}")
+    end
+
+    # Assigns to event: url, title, description, keywords, contact.
+    def assign_basic_info(event, calevent)
+      event.url = calevent.url.to_s
+      event.title = calevent.summary.to_s
+      event.description = calevent.description.to_s
+      event.keywords = calevent.categories.flatten
+      event.contact = calevent.contact.join(', ')
+    end
+
+    # Assigns to event: start, end, timezone.
+    # start and end are left unset when the calendar event does not define them.
+    def assign_time_info(event, calevent)
+      dtstart = calevent.dtstart
+      dtend = calevent.dtend
+      event.start = dtstart.to_time unless dtstart.nil?
+      event.end = dtend.to_time unless dtend.nil?
+      event.timezone = get_tzid(dtstart)
+    end
+
+    # Assigns to event: venue, online, presence.
+    # Does nothing when the calendar event has no location.
+    def assign_location_info(event, calevent)
+      location = calevent.location
+      return if location.blank?
+
+      # Rough heuristic, sufficient for now: a located event whose description mentions
+      # Zoom (in any letter case) is hybrid. to_s guards against a missing description.
+      mentions_zoom = calevent.description.to_s.match?(/zoom/i)
+      event.venue = location.to_s
+      event.online = mentions_zoom
+      event.presence = mentions_zoom ? :hybrid : :onsite
+    end
+
+    # Extracts the timezone identifier (TZID) from an iCalendar event's dtstart field.
+    # The parameter may come as an Array or as a single String; nil when absent.
+    def get_tzid(dtstart)
+      return nil unless dtstart.respond_to?(:ical_params)
+
+      tzid = dtstart.ical_params['tzid']
+      return nil if tzid.nil?
+
+      Array(tzid).first.to_s
+    end
+  end
+end
diff --git a/lib/ingestors/ingestor.rb b/lib/ingestors/ingestor.rb
index 8f043f135..a662985cf 100644
--- a/lib/ingestors/ingestor.rb
+++ b/lib/ingestors/ingestor.rb
@@ -50,13 +50,14 @@ def stats_summary(type)
       summary
     end
 
-    def open_url(url, raise: false)
+    def open_url(url, raise: false, token: nil)
       options = {
         redirect: false, # We're doing redirects manually below, since open-uri can't handle http -> https redirection
         read_timeout: 5
       }
       options[:ssl_verify_mode] = config[:ssl_verify_mode] if config.key?(:ssl_verify_mode)
       redirect_attempts = 5
+      options['Authorization'] = "Bearer #{token}" unless token.nil?
begin URI(url).open(options) rescue OpenURI::HTTPRedirect => e diff --git a/lib/ingestors/ingestor_factory.rb b/lib/ingestors/ingestor_factory.rb index fe1a7bd3a..1da61a790 100644 --- a/lib/ingestors/ingestor_factory.rb +++ b/lib/ingestors/ingestor_factory.rb @@ -6,6 +6,7 @@ def self.ingestors Ingestors::EventbriteIngestor, Ingestors::EventCsvIngestor, Ingestors::IcalIngestor, + Ingestors::IndicoIngestor, Ingestors::LibcalIngestor, Ingestors::MaterialCsvIngestor, Ingestors::TessEventIngestor, diff --git a/test/fixtures/files/ingestion/indico/event.ics b/test/fixtures/files/ingestion/indico/event.ics new file mode 100644 index 000000000..58728eadf --- /dev/null +++ b/test/fixtures/files/ingestion/indico/event.ics @@ -0,0 +1,16 @@ +BEGIN:VCALENDAR +VERSION:2.0 +PRODID:-//CERN//INDICO//EN +BEGIN:VEVENT +SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials +DTSTART:20260309T080000Z +DTEND:20260313T161500Z +DTSTAMP:20251204T105300Z +UID:indico-event-1617123@indico.cern.ch +CONTACT:name.surname@test.com +DESCRIPTION:speakers and zoom here +LOCATION:CERN +URL:https://indico.cern.ch/event/1617123/ +CATEGORIES:TRAINING,EDUCATION +END:VEVENT +END:VCALENDAR \ No newline at end of file diff --git a/test/fixtures/files/ingestion/indico/events.ics b/test/fixtures/files/ingestion/indico/events.ics new file mode 100644 index 000000000..7f3bbfd22 --- /dev/null +++ b/test/fixtures/files/ingestion/indico/events.ics @@ -0,0 +1,25 @@ +BEGIN:VCALENDAR +VERSION:2.0 +PRODID:-//CERN//INDICO//EN +BEGIN:VEVENT +SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials +DTSTART:20260309T080000Z +DTEND:20260313T161500Z +DTSTAMP:20251203T150800Z +UID:indico-event-1617123@indico.cern.ch +CONTACT:name.surname@test.com +DESCRIPTION:speakers and zoom here +LOCATION:CERN +URL:https://indico.cern.ch/event/1617123/ +END:VEVENT +BEGIN:VEVENT +SUMMARY:HEP C++ Course and Hands-on Training - Stay Informed +DTSTART:20991231T225800Z +DTEND:20991231T225900Z +DTSTAMP:20251203T150800Z 
+UID:indico-event-1211412@indico.cern.ch
+CONTACT:name.surname@test.com
+DESCRIPTION:mockdescription
+URL:https://indico.cern.ch/event/1211412/
+END:VEVENT
+END:VCALENDAR
\ No newline at end of file
diff --git a/test/fixtures/files/ingestion/indico/indico.html b/test/fixtures/files/ingestion/indico/indico.html
new file mode 100644
index 000000000..33765bb38
--- /dev/null
+++ b/test/fixtures/files/ingestion/indico/indico.html
@@ -0,0 +1,16 @@
+
+
+
+
+ My Agenda (Indico)
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/test/unit/ingestors/indico_ingestor_test.rb b/test/unit/ingestors/indico_ingestor_test.rb
new file mode 100644
index 000000000..7d9183df7
--- /dev/null
+++ b/test/unit/ingestors/indico_ingestor_test.rb
@@ -0,0 +1,122 @@
+require 'test_helper'
+
+# Exercises Ingestors::IndicoIngestor against stubbed Indico pages and .ics exports.
+class IndicoIngestorTest < ActiveSupport::TestCase
+  setup do
+    @ingestor = Ingestors::IndicoIngestor.new
+    @user = users(:regular_user)
+    @content_provider = content_providers(:another_portal_provider)
+    # mock_ingestions
+    # mock_timezone # System time zone should not affect test result
+
+    webmock('https://indico.cern.ch/event/1617123/', 'indico/indico.html')
+    webmock('https://indico.cern.ch/event/1617123/event.ics', 'indico/event.ics')
+    webmock('https://indico.cern.ch/category/11733/', 'indico/indico.html')
+    webmock('https://indico.cern.ch/category/11733/events.ics', 'indico/events.ics')
+    webmock('https://myagenda.com/event/1617123/', 'indico/indico.html')
+    webmock('https://myagenda.com/event/1617123/event.ics', 'indico/event.ics')
+  end
+
+  teardown do
+    reset_timezone
+  end
+
+  test 'should read indico link event' do
+    @ingestor.read('https://indico.cern.ch/event/1617123/')
+    @ingestor.write(@user, @content_provider)
+
+    sample = @ingestor.events.detect { |e| e.title == '14th HEP C++ Course and Hands-on Training - The Essentials' }
+    assert sample.persisted?
+
+    # Minitest convention: expected value first, actual value second.
+    assert_equal 'https://indico.cern.ch/event/1617123/', sample.url
+    assert_equal '14th HEP C++ Course and Hands-on Training - The Essentials', sample.title
+    assert_equal 'speakers and zoom here', sample.description
+    assert_equal %w[TRAINING EDUCATION], sample.keywords
+    assert_equal 'name.surname@test.com', sample.contact
+    # Times are compared with Time objects, not with their string representation.
+    assert_equal Time.utc(2026, 3, 9, 8, 0, 0), sample.start
+    assert_equal Time.utc(2026, 3, 13, 16, 15, 0), sample.end
+    assert_equal 'UTC', sample.timezone
+    assert_equal 'CERN', sample.venue
+    assert_equal 'hybrid', sample.presence
+  end
+
+  test 'should read indico link category' do
+    @ingestor.read('https://indico.cern.ch/category/11733/')
+    @ingestor.write(@user, @content_provider)
+
+    sample = @ingestor.events.detect { |e| e.title == '14th HEP C++ Course and Hands-on Training - The Essentials' }
+    sample2 = @ingestor.events.detect { |e| e.title == 'HEP C++ Course and Hands-on Training - Stay Informed' }
+    assert sample.persisted?
+    assert sample2.persisted?
+
+    assert_equal 'https://indico.cern.ch/event/1617123/', sample.url
+    assert_equal 'https://indico.cern.ch/event/1211412/', sample2.url
+  end
+
+  test 'should read non-indico link event' do
+    @ingestor.read('https://myagenda.com/event/1617123/')
+    @ingestor.write(@user, @content_provider)
+
+    sample = @ingestor.events.detect { |e| e.title == '14th HEP C++ Course and Hands-on Training - The Essentials' }
+    assert sample.persisted?
+
+    assert_equal 'https://indico.cern.ch/event/1617123/', sample.url
+  end
+
+  test 'should convert url properly' do
+    indico_url_event = 'https://indico.cern.ch/event/1588342/'
+    indico_url_event_with_ics = 'https://indico.cern.ch/event/1588342/event.ics' # ! when '/event', event.ics is singular
+    indico_url_event_with_query = 'https://indico.cern.ch/event/1588342/?somerandom=urlparams&an=otherone'
+    indico_url_event_with_query_with_ics = 'https://indico.cern.ch/event/1588342/event.ics?somerandom=urlparams&an=otherone'
+    indico_url_category = 'https://indico.cern.ch/category/19377/'
+    indico_url_category_with_ics = 'https://indico.cern.ch/category/19377/events.ics' # ! when '/category', eventS.ics is plural
+    indico_url_category_with_query = 'https://indico.cern.ch/category/19377/?a=b&c=d'
+    indico_url_category_with_query_with_ics = 'https://indico.cern.ch/category/19377/events.ics?a=b&c=d'
+    url_with_ics = 'https://mywebsite.com/event/blabla/event.ics'
+    url_with_query_with_ics = 'https://mywebsite.com/event/blabla/event.ics?john=doe&isstub=born'
+
+    # When indico link – event
+    assert_equal indico_url_event_with_ics, @ingestor.send(:to_export, indico_url_event) # adds event.ics
+    assert_equal indico_url_event_with_query_with_ics, @ingestor.send(:to_export, indico_url_event_with_query) # adds event.ics
+
+    # When indico link – category
+    assert_equal indico_url_category_with_ics, @ingestor.send(:to_export, indico_url_category) # adds events.ics (with an s)
+    assert_equal indico_url_category_with_query_with_ics, @ingestor.send(:to_export, indico_url_category_with_query) # adds events.ics (with an s)
+
+    # When non-indico but ics link
+    assert_equal url_with_ics, @ingestor.send(:to_export, url_with_ics) # keeps same
+    assert_equal url_with_query_with_ics, @ingestor.send(:to_export, url_with_query_with_ics) # keeps same
+
+    # When indico link which already has the export file name
+    assert_equal indico_url_event_with_ics, @ingestor.send(:to_export, indico_url_event_with_ics) # keeps it as-is
+    assert_equal indico_url_event_with_query_with_ics, @ingestor.send(:to_export, indico_url_event_with_query_with_ics) # keeps it as-is
+  end
+
+  test 'should test std err' do
+    @ingestor.stub :open_url, ->(_url, *) { raise StandardError, 'test failure' } do
+      @ingestor.send(:process_url, 'https://indico.cern.ch/event/1617123/')
+
+      assert_equal 'Process file url[https://indico.cern.ch/event/1617123/event.ics] failed with: test failure', @ingestor.messages.first
+    end
+
+    @ingestor.stub(:assign_basic_info, ->(*) { raise StandardError, 'test failure' }) do
+      mock_logger = Minitest::Mock.new
+      mock_logger.expect(:error, nil, ['StandardError: test failure'])
+
+      Rails.stub(:logger, mock_logger) do
+        @ingestor.send(:process_calevent, Icalendar::Event.new)
+      end
+      mock_logger.verify
+    end
+  end
+
+  private
+
+  # Stubs GET requests on +url+ to return the ingestion fixture +filename+.
+  def webmock(url, filename)
+    file = Rails.root.join('test', 'fixtures', 'files', 'ingestion', filename)
+    WebMock.stub_request(:get, url).to_return(status: 200, headers: {}, body: file.read)
+  end
+end