Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion app/controllers/events_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ def event_params
:timezone, :content_provider_id, { collection_ids: [] }, { node_ids: [] },
{ node_names: [] }, { target_audience: [] }, { eligibility: [] }, :visible,
{ host_institutions: [] }, :capacity, :contact, :recognition, :learning_objectives,
:prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language,
:prerequisites, :tech_requirements, :cost_basis, :cost_value, :cost_currency, :language, :presence,
external_resources_attributes: %i[id url title _destroy],
external_resources: %i[url title], material_ids: [],
llm_interaction_attributes: %i[id scrape_or_process model prompt input output needs_processing _destroy],
Expand Down
1 change: 1 addition & 0 deletions config/secrets.example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ external_api_keys: &external_api_keys
password:
gpt_api_key:
willma_api_key:
indico_api_token: # begins with 'indp_'; see https://docs.getindico.io/en/stable/http-api/access/#api-token-authentication
orcid:
client_id:
secret:
Expand Down
48 changes: 48 additions & 0 deletions lib/ingestors/concerns/sitemap_helpers.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# frozen_string_literal: true

module Ingestors
module Concerns
# From a sitemap.{xml|txt} or a single URL, get the list of URLs (= sources)
module SitemapHelpers
private

# Turns a source URL into the list of URLs to ingest.
# The match is done on the lower-cased URL, but the original URL is what gets passed on:
# - "...sitemap*.xml" is expanded through parse_xml_sitemap,
# - "...sitemap*.txt" is expanded through parse_txt_sitemap,
# - anything else is treated as a single source and returned as a one-element list.
def parse_sitemap(source_url)
  normalized = source_url.downcase
  return parse_xml_sitemap(source_url) if normalized.match?(/sitemap(.*)?\.xml\Z/)
  return parse_txt_sitemap(source_url) if normalized.match?(/sitemap(.*)?\.txt\Z/)

  [source_url]
end

# Reads an XML sitemap through SitemapParser (called with recurse: true and the
# configured User-Agent header) and returns its URLs, stripped, without blank
# entries and without duplicates.
# On any StandardError the failure is recorded in @messages and nil is returned.
def parse_xml_sitemap(url)
  parser = SitemapParser.new(
    url,
    recurse: true,
    headers: { 'User-Agent' => config[:user_agent] }
  )
  # Strip before de-duplicating so entries that differ only by surrounding
  # whitespace collapse into a single URL.
  urls = parser.to_a.map(&:strip).reject(&:empty?).uniq

  log_sitemap('xml', url, urls.count)
  urls
rescue StandardError => e
  @messages << "Extract from sitemap[#{url}] failed with: #{e.message}"
  nil
end

# Reads a plain-text sitemap (one URL per line) via open_url and returns its
# URLs, stripped, without blank lines and without duplicates.
def parse_txt_sitemap(url)
  # Lines still carry their trailing newline here, so strip first: otherwise a
  # last line without a newline would not be recognised as a duplicate, and
  # blank lines would be returned as empty URLs.
  urls = open_url(url).to_a.map(&:strip).reject(&:empty?).uniq

  log_sitemap('txt', url, urls.count)
  urls
end

# Appends a summary line to @messages: which kind of sitemap (ext) was parsed,
# from which URL, and how many URLs it yielded.
def log_sitemap(ext, url, count)
  summary = "Parsing .#{ext} sitemap: #{url}\n - #{count} URLs found"
  @messages << summary
end
end
end
end
141 changes: 141 additions & 0 deletions lib/ingestors/indico_ingestor.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
# frozen_string_literal: true

require 'icalendar'
require 'nokogiri'
require 'open-uri'
require 'tzinfo'

module Ingestors
# Reads from direct .ics or Indico (event or category) URLs, .xml sitemaps, and .txt sitemaps.
class IndicoIngestor < Ingestor
include Ingestors::Concerns::SitemapHelpers

# Static description of this ingestor: its lookup key, the title shown for it,
# and the category of records it produces.
def self.config
  { key: 'indico', title: 'Indico / .ics file', category: :events }
end

# Entry point: expands source_url into one or more URLs (via parse_sitemap)
# and processes each of them in turn.
# Loads the Indico API token from the application secrets into @token first.
# Returns nil without processing anything when parse_sitemap returns nil.
def read(source_url)
  @token = Rails.application.config.secrets.indico_api_token
  @verbose = false

  parse_sitemap(source_url)&.each { |url| process_url(url) }
end

private

# Converts the given URL to its .ics export URL, downloads it and processes
# every event found in it. Note: one .ics file can hold multiple events.
#
# Failures are never propagated: any StandardError (URL that cannot be mapped to
# an export, download error, no events in the file, ...) is recorded in @messages.
#
# NOTE(review): @token is passed to open_url for every export URL, whatever its
# host -- confirm that this is intended for sources outside your Indico instance.
def process_url(url)
  export_url = to_export(url)
  raise 'Not an indico link' if export_url.nil?

  content = open_url(export_url, raise: true, token: @token).set_encoding('utf-8')
  events = Icalendar::Event.parse(content)
  raise 'Not found' if events.nil? || events.empty?

  events.each { |calevent| process_calevent(calevent) }
rescue StandardError => e
  # export_url is nil when to_export returned nil or raised; fall back to the
  # URL we were given so the message always names the failing source.
  @messages << "Process file url[#{export_url || url}] failed with: #{e.message}"
end

# Maps a URL to the .ics export URL to download.
# 1. If the path already ends with '/event.ics' or '/events.ics', the URL is returned as-is.
# 2. Otherwise, if the URL is recognised as an Indico page (see indico_page?),
#    its path is rewritten by ensure_events_ics_path.
# The given URL string itself is never mutated.
# Returns the export URL string, or nil if the input is blank or matches neither case.
def to_export(url)
  return nil if url.blank?

  uri = URI.parse(url)
  return uri.to_s if uri.path.to_s.match?(%r{/(event|events)\.ics\z})

  ensure_events_ics_path(uri) if indico_page?(uri)
end

# Tells whether the given URI points at an Indico site.
# True when the host name contains 'indico' (e.g. 'indico.myinstitution.com').
# Otherwise the page is downloaded and its og:site_name meta tag is matched
# against /indico/i; the result is nil when that tag (or its content) is absent.
def indico_page?(uri)
  return true if uri.host&.include?('indico')

  page = Nokogiri::HTML(open_url(uri, raise: true))
  site_name = page.at('meta[property="og:site_name"]')&.[]('content')
  site_name&.match?(/indico/i)
end

# Rewrites the path of an Indico URI so that it targets the .ics export.
# The path is cut down to its first two segments (e.g. '/event/<id>/'), then
# 'event.ics' is appended for '/event/...' paths and 'events.ics' for
# '/category/...' paths. Any other path is only truncated, with no file appended.
# Mutates uri.path and returns the resulting URL as a string.
def ensure_events_ics_path(uri)
  segments = uri.path.split('/')
  base_path = "#{segments[0..2].join('/')}/"
  export_file = { 'event' => 'event.ics', 'category' => 'events.ics' }[segments[1]]

  uri.path = export_file ? File.join(base_path, export_file) : base_path
  uri.to_s
end

# Builds an OpenStruct event from one calendar event (basic, time and location
# details) and hands it to add_event.
# Any StandardError raised along the way is written to the Rails error log and
# the event is skipped.
def process_calevent(calevent)
  event_to_add = OpenStruct.new
  assign_basic_info(event_to_add, calevent)
  assign_time_info(event_to_add, calevent)
  assign_location_info(event_to_add, calevent)

  add_event(event_to_add)
rescue StandardError => e
  Rails.logger.error("#{e.class}: #{e.message}")
end

# Assigns to event: url, title, description, keywords, contact.
# url, title and description are the string forms of the calendar event's url,
# summary and description; keywords is the flattened categories list; contact
# is the contact list joined with ', '.
def assign_basic_info(event, calevent)
  { url: :url, title: :summary, description: :description }.each do |attribute, property|
    event.public_send("#{attribute}=", calevent.public_send(property).to_s)
  end

  event.keywords = calevent.categories.flatten
  event.contact = calevent.contact.join(', ')
end

# Assigns to event: start, end, timezone.
# start and end are only set when the calendar event has a dtstart / dtend;
# timezone is whatever get_tzid extracts from dtstart (possibly nil).
def assign_time_info(event, calevent)
  starts_at = calevent.dtstart
  event.start = starts_at.to_time unless starts_at.nil?

  ends_at = calevent.dtend
  event.end = ends_at.to_time unless ends_at.nil?

  event.timezone = get_tzid(calevent.dtstart)
end

# Assigns to event: venue, online, presence.
# Does nothing when the calendar event has no location.
# The event is flagged as online (and its presence as :hybrid rather than
# :onsite) when its description mentions "zoom", in any letter case.
def assign_location_info(event, calevent)
  location = calevent.location
  return if location.blank?

  # A missing description must not raise: the caller would then drop the whole
  # event. Treat it as an empty text instead.
  mentions_zoom = calevent.description.to_s.match?(/zoom/i)

  event.venue = location.to_s
  event.online = mentions_zoom
  event.presence = mentions_zoom ? :hybrid : :onsite # can do best, but sufficient for now
end

# Extracts the timezone identifier (TZID) from an iCalendar dtstart value.
# Returns nil when dtstart exposes no ical_params or carries no 'tzid' entry.
# The parameter may be stored as an Array or as a single value: the first
# element, or the value itself, is returned as a String.
def get_tzid(dtstart)
  return nil unless dtstart.respond_to?(:ical_params)

  tzid = dtstart.ical_params['tzid']
  case tzid
  when nil then nil
  when Array then tzid.first.to_s
  else tzid.to_s
  end
end
end
end
3 changes: 2 additions & 1 deletion lib/ingestors/ingestor.rb
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,14 @@ def stats_summary(type)
summary
end

def open_url(url, raise: false)
def open_url(url, raise: false, token: nil)
options = {
redirect: false, # We're doing redirects manually below, since open-uri can't handle http -> https redirection
read_timeout: 5
}
options[:ssl_verify_mode] = config[:ssl_verify_mode] if config.key?(:ssl_verify_mode)
redirect_attempts = 5
options['Authorization'] = "Bearer #{token}" unless token.nil?
begin
URI(url).open(options)
rescue OpenURI::HTTPRedirect => e
Expand Down
1 change: 1 addition & 0 deletions lib/ingestors/ingestor_factory.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ def self.ingestors
Ingestors::EventbriteIngestor,
Ingestors::EventCsvIngestor,
Ingestors::IcalIngestor,
Ingestors::IndicoIngestor,
Ingestors::LibcalIngestor,
Ingestors::MaterialCsvIngestor,
Ingestors::TessEventIngestor,
Expand Down
16 changes: 16 additions & 0 deletions test/fixtures/files/ingestion/indico/event.ics
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//CERN//INDICO//EN
BEGIN:VEVENT
SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials
DTSTART:20260309T080000Z
DTEND:20260313T161500Z
DTSTAMP:20251204T105300Z
UID:indico-event-1617123@indico.cern.ch
CONTACT:name.surname@test.com
DESCRIPTION:speakers and zoom here
LOCATION:CERN
URL:https://indico.cern.ch/event/1617123/
CATEGORIES:TRAINING,EDUCATION
END:VEVENT
END:VCALENDAR
25 changes: 25 additions & 0 deletions test/fixtures/files/ingestion/indico/events.ics
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//CERN//INDICO//EN
BEGIN:VEVENT
SUMMARY:14th HEP C++ Course and Hands-on Training - The Essentials
DTSTART:20260309T080000Z
DTEND:20260313T161500Z
DTSTAMP:20251203T150800Z
UID:indico-event-1617123@indico.cern.ch
CONTACT:name.surname@test.com
DESCRIPTION:speakers and zoom here
LOCATION:CERN
URL:https://indico.cern.ch/event/1617123/
END:VEVENT
BEGIN:VEVENT
SUMMARY:HEP C++ Course and Hands-on Training - Stay Informed
DTSTART:20991231T225800Z
DTEND:20991231T225900Z
DTSTAMP:20251203T150800Z
UID:indico-event-1211412@indico.cern.ch
CONTACT:name.surname@test.com
DESCRIPTION:mockdescription
URL:https://indico.cern.ch/event/1211412/
END:VEVENT
END:VCALENDAR
16 changes: 16 additions & 0 deletions test/fixtures/files/ingestion/indico/indico.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!DOCTYPE html>

<html lang="en"
data-canonical-locale="en-US"
data-static-site="false">
<head>
<title>My Agenda (Indico)</title>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<link rel="shortcut icon" type="image/x-icon" href="/images/indico.ico">

<meta property="og:site_name" content="Agenda (Indico)">
<meta property="og:image" content="https://agenda.com/images/indico_square.png">

</head>
</html>
Loading
Loading