-
Notifications
You must be signed in to change notification settings - Fork 19
Material Ingestor – GitHub #1154
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ec6bbe3
ceff245
9bfbc67
1ceba7f
1c3a0a6
791b4dc
23e04f1
d52a2a0
6c234ab
c7d4b75
7690b6b
18b415f
fe1a4fd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,243 @@ | ||
| # frozen_string_literal: true | ||
|
|
||
| require 'open-uri' | ||
| require 'json' | ||
| require 'httparty' | ||
| require 'nokogiri' | ||
|
|
||
module Ingestors
  # GithubIngestor fetches repository information from GitHub to populate the materials' metadata.
  #
  # API requests made per repository (4 in total):
  # 1. General metadata:  #{GITHUB_API_BASE}/#{full_name}
  #    keys: name, full_name, owner.login, html_url, description,
  #          homepage, topics, license.{key, spdx_id}, archived,
  #          created_at, pushed_at, updated_at, contributors_url
  # 2. DOI:               #{GITHUB_API_BASE}/#{full_name}/contents/README.md
  #    key: content
  # 3. Version/release:   #{GITHUB_API_BASE}/#{full_name}/releases
  #    key: tag_name (first entry)
  # 4. Contributors list: #{GITHUB_API_BASE}/#{full_name}/contributors
  #    key: login (from all entries)
  class GithubIngestor < Ingestor # rubocop:disable Metrics/ClassLength
    include Ingestors::Concerns::SitemapHelpers

    GITHUB_API_BASE = 'https://api.github.com/repos'
    CACHE_PREFIX = 'github_ingestor_'
    TTL = 1.week # cache expiration time (time to live before cache expires)

    def self.config
      {
        key: 'github',
        title: 'GitHub Repository or Page',
        category: :materials,
        user_agent: 'TeSS Github ingestor'
      }
    end

    # Reads from direct GitHub URLs, .xml sitemaps, and .txt sitemaps.
    # Fetches repository metadata, contributors, releases, and DOIs (from the README.md badge).
    # It handles automatically GitHub Pages URLs (github.io) and standard github.com URLs.
    # It caches API responses to avoid repeated calls.
    def read(source_url)
      @verbose = false
      # parse_sitemap returns either a collection of unique URL entries or the URL itself
      sources = parse_sitemap(source_url)

      sources.each do |url|
        # Resolve each source to its api.github.com URL; skip anything that is not github.{com|io}
        repo_api_url = to_github_api(url)
        next unless repo_api_url

        # Use the cached repo data when present, otherwise fetch and cache it
        key = "#{CACHE_PREFIX}#{repo_api_url.gsub(%r{https?://}, '').gsub('/', '_')}"
        repo_data = cache_fetch(key, repo_api_url)
        next unless repo_data

        # Add to material
        add_material to_material(repo_data)
      end
    rescue StandardError => e
      Rails.logger.error("#{e.class}: read() failed, #{e.message}")
    end

    private

    # Takes a github.{com|io} url and returns its api.github.com url,
    # or nil when the url is not a recognisable GitHub repo/page address.
    def to_github_api(url)
      uri = URI(url)
      parts = uri.path.split('/') # 'example.com/foo/bar' will have path == '/foo/bar', so three parts

      # http(s)://github.com/<username>/<repo> is the strict way to pass
      if uri.host&.downcase == 'github.com' && (uri.host.count('.') == 1) && parts.size == 3
        github_api_from_com(parts)
      # http(s)://<username>.github.io/<repo> is the strict way to pass
      elsif uri.host&.downcase&.end_with?('.github.io') && (uri.host.count('.') == 2) && parts.size >= 2
        github_api_from_io(uri, parts)
      end
    end

    # Builds the API url from a github.com path: /<owner>/<repo>
    def github_api_from_com(parts)
      "#{GITHUB_API_BASE}/#{parts[1]}/#{parts[2]}"
    end

    # Builds the API url from a github.io address: <owner>.github.io/<repo>
    def github_api_from_io(uri, parts)
      repo = parts[1]
      owner = uri.host.split('.').first
      "#{GITHUB_API_BASE}/#{owner}/#{repo}"
    end

    # Fetch cached data or opens webpage/api and cache it (expires after TTL).
    # Caching is used because GitHub limits unauthenticated users to 60 requests per hour:
    # https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api?apiVersion=2022-11-28#primary-rate-limit-for-unauthenticated-users
    # One GitHub URL equals to 4 GitHub API requests.
    # key: string key for the cache
    # url: url to open
    def cache_fetch(key, url)
      Rails.cache.fetch(key, expires_in: TTL, skip_nil: true) do
        JSON.parse(open_url(url).read)
      end
    end

    # Maps the repo's API payload onto a material struct.
    def to_material(repo_data) # rubocop:disable Metrics/AbcSize
      github_io_homepage = github_io_homepage?(repo_data['homepage'])
      url = github_io_homepage ? repo_data['homepage'] : repo_data['html_url']
      redirected_url = get_redirected_url(url)
      html = get_html(redirected_url)

      material = OpenStruct.new
      material.title = repo_data['name'].titleize
      material.url = url
      material.description = github_io_homepage ? fetch_definition(html, redirected_url) : repo_data['description']
      material.keywords = repo_data['topics']
      material.licence = fetch_licence(repo_data['license'])
      material.status = repo_data['archived'] ? 'Archived' : 'Active'
      material.doi = fetch_doi(repo_data['full_name'])
      material.version = fetch_latest_release(repo_data['full_name'])
      material.date_created = repo_data['created_at']
      material.date_published = repo_data['pushed_at']
      material.date_modified = repo_data['updated_at']
      material.contributors = fetch_contributors(repo_data['contributors_url'], repo_data['full_name'])
      material.resource_type = github_io_homepage ? ['Github Page'] : ['Github Repository']
      material.prerequisites = fetch_prerequisites(html)
      material
    end

    # True when the repo declares a GitHub Pages (github.io) homepage.
    def github_io_homepage?(homepage)
      return false if homepage.nil? || homepage.empty?

      url = URI(homepage)
      url.host&.downcase&.end_with?('.github.io')
    end

    # Downloads a page (following redirects) and parses it with Nokogiri.
    def get_html(url)
      response = HTTParty.get(url, follow_redirects: true, headers: { 'User-Agent' => config[:user_agent] })
      Nokogiri::HTML(response.body)
    end

    # DEFINITION – Opens the GitHub homepage, fetches the 3 first >50 char <p> tags' text
    # and joins them with a 'Read more...' link at the end of the description.
    # Some of the first <p> tags were not descriptive, thus skipping the short ones.
    def fetch_definition(html, url)
      desc = ''
      round = 3
      html.css('p').each do |p|
        break if round.zero? # stop scanning once the 3-paragraph budget is spent

        p_txt = p&.text&.strip&.gsub(/\s+/, ' ') || ''
        next if p_txt.length < 50

        desc = "#{desc}\n#{p_txt}"
        round -= 1
      end
      "#{desc}\n(...) [Read more...](#{url})"
    end

    # LICENCE – Get proper licence
    # the licence must match the format of config/dictionaries/licences.yml
    def fetch_licence(licence)
      return 'notspecified' if licence.nil? || licence == 'null'
      return 'other-at' if licence['key'] == 'other'

      licence['spdx_id']
    end

    # DOI – Fetches DOI from various sources in a repo.
    # Only README.md is read, as it seems to carry the DOI badge almost every time,
    # whereas also fetching CITATION.cff or CITATION.md would increase
    # the number of API requests.
    def fetch_doi(full_name)
      filename = 'README.md'
      url = "#{GITHUB_API_BASE}/#{full_name}/contents/#{filename}"
      data = cache_fetch("#{CACHE_PREFIX}doi_#{full_name.gsub('/', '_')}_#{filename.downcase}", url)
      return nil unless data && data['content']

      decoded = Base64.decode64(data['content'])
      doi_match = decoded.match(%r{doi.org/\s*([^\s,)]+)}i)
      doi_match ? "https://doi.org/#{doi_match[1]}" : nil
    end

    # RELEASE – Opens releases API address and returns the latest release tag (or nil).
    def fetch_latest_release(full_name)
      url = "#{GITHUB_API_BASE}/#{full_name}/releases"
      releases = cache_fetch("#{CACHE_PREFIX}releases_#{full_name.gsub('/', '_')}", url)
      releases.is_a?(Array) && releases.first ? releases.first['tag_name'] : nil
    end

    # CONTRIBUTORS – Opens contributors API address and returns the list of contributor logins.
    def fetch_contributors(contributors_url, full_name)
      contributors = cache_fetch("#{CACHE_PREFIX}contributors_#{full_name.gsub('/', '_')}", contributors_url)
      return [] unless contributors

      contributors.map { |c| c['login'] }
    end

    # PREREQUISITES – From the homepage HTML, looks for <p> tags which are children of ...
    def fetch_prerequisites(html)
      prereq_paragraphs = []

      # ... any heading tag (h1–h6) or span tag with text "prereq" (EN) or "prerreq" (ES)
      prereq_paragraphs = fetch_prerequisites_from_h(html, prereq_paragraphs)

      # ... any tag with id containing "prereq" (EN) or "prerreq" (ES)
      prereq_paragraphs = fetch_prerequisites_from_id_or_class(html, prereq_paragraphs) if prereq_paragraphs.empty?

      prereq_paragraphs&.join("\n")&.gsub(/\n\n+/, "\n")&.strip || ''
    end

    # Collects the p/ul/ol siblings following a heading or span whose text mentions prerequisites.
    def fetch_prerequisites_from_h(html, prereq_paragraphs)
      html.xpath('//h1|//h2|//h3|//h4|//h5|//h6|//span').each do |h|
        next unless h.text =~ /prereq|prerreq/i # if prereq in text

        paragraph = h.xpath('following-sibling::*')
                     .take_while { |sib| %w[p ul ol].include?(sib.name) } # take either p, ul or ol
        prereq_paragraphs.concat(paragraph) if paragraph
      end
      prereq_paragraphs
    end

    # Collects paragraphs near any element whose id/class mentions prerequisites.
    def fetch_prerequisites_from_id_or_class(html, prereq_paragraphs)
      html.xpath('//*[@id]').each do |node|
        next unless prereq_node?(node)

        extract_following_paragraphs(node, prereq_paragraphs)
        extract_nested_paragraphs(node, prereq_paragraphs) if prereq_paragraphs.empty?
      end
      prereq_paragraphs
    end

    # True when the node's id or class attribute mentions "prereq" (EN) or "prerreq" (ES).
    def prereq_node?(node)
      [node['id'], node['class']].compact.any? { |attr| attr =~ /prereq|prerreq/i }
    end

    # Appends the p/ul/ol siblings immediately following the node.
    def extract_following_paragraphs(node, prereq_paragraphs)
      paragraphs = node.xpath('following-sibling::*')
                       .take_while { |sib| %w[p ul ol].include?(sib.name) }
      prereq_paragraphs.concat(paragraphs) if paragraphs
    end

    # Appends any p/ul/ol elements nested inside the node itself.
    def extract_nested_paragraphs(node, prereq_paragraphs)
      paragraphs = node.xpath('.//p | .//ul | .//ol')
      prereq_paragraphs.concat(paragraphs) if paragraphs.any?
    end
  end
end
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| { | ||
| "name": "cpluspluscourse", | ||
| "full_name": "hsf-training/cpluspluscourse", | ||
| "owner": { | ||
| "login": "hsf-training" | ||
| }, | ||
| "html_url": "https://github.com/hsf-training/cpluspluscourse", | ||
| "description": "C++ Course Taught at CERN", | ||
| "homepage": "", | ||
| "topics": [ | ||
| "those", | ||
| "are", | ||
| "keywords" | ||
| ], | ||
| "license": { | ||
| "key": "apache-2.0", | ||
| "name": "Apache License 2.0", | ||
| "spdx_id": "Apache-2.0" | ||
| }, | ||
| "archived": true, | ||
| "created_at": "2025-09-29T14:38:38Z", | ||
| "updated_at": "2025-09-30T14:38:38Z", | ||
| "pushed_at": "2025-09-28T14:38:38Z", | ||
| "contributors_url": "https://api.github.com/repos/hsf-training/cpluspluscourse/contributors" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| { | ||
| "name": "python-novice-inflammation", | ||
| "full_name": "swcarpentry/python-novice-inflammation", | ||
| "owner": { | ||
| "login": "swcarpentry" | ||
| }, | ||
| "html_url": "https://github.com/swcarpentry/python-novice-inflammation", | ||
| "description": "This is not going to be read", | ||
| "homepage": "https://swcarpentry.github.io/python-novice-inflammation/", | ||
| "topics": [ | ||
| "key", | ||
| "words", | ||
| "in topics" | ||
| ], | ||
| "license": { | ||
| "key": "apache-2.0", | ||
| "name": "Apache License 2.0", | ||
| "spdx_id": "Apache-2.0" | ||
| }, | ||
| "archived": false, | ||
| "created_at": "2025-09-29T14:38:38Z", | ||
| "updated_at": "2025-09-30T14:38:38Z", | ||
| "pushed_at": "2025-09-28T14:38:38Z", | ||
| "contributors_url": "https://api.github.com/repos/swcarpentry/python-novice-inflammation/contributors" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,25 @@ | ||
| { | ||
| "name": "hsf-training-scikit-hep-webpage", | ||
| "full_name": "hsf-training/hsf-training-scikit-hep-webpage", | ||
| "owner": { | ||
| "login": "hsf-training" | ||
| }, | ||
| "html_url": "https://github.com/hsf-training/hsf-training-scikit-hep-webpage", | ||
| "description": null, | ||
| "homepage": "https://hsf-training.github.io/hsf-training-scikit-hep-webpage/", | ||
| "topics": [ | ||
| "hacktoberfest", | ||
| "hey", | ||
| "test" | ||
| ], | ||
| "license": { | ||
| "key": "other", | ||
| "name": "Other", | ||
| "spdx_id": "NOASSERTION" | ||
| }, | ||
| "archived": false, | ||
| "created_at": "2022-03-23T17:00:05Z", | ||
| "updated_at": "2025-09-29T06:14:55Z", | ||
| "pushed_at": "2025-09-23T20:09:10Z", | ||
| "contributors_url": "https://api.github.com/repos/hsf-training/hsf-training-scikit-hep-webpage/contributors" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| { | ||
| "name": "bigchange", | ||
| "full_name": "hsf-training/cpluspluscourse", | ||
| "html_url": "https://github.com/hsf-training/cpluspluscourse", | ||
| "topics": [ | ||
| "those", | ||
| "are", | ||
| "NOT" | ||
| ], | ||
| "contributors_url": "https://api.github.com/repos/hsf-training/cpluspluscourse/contributors" | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| [ | ||
| { | ||
| "login": "jane" | ||
| }, | ||
| { | ||
| "login": "doe" | ||
| } | ||
| ] |
Uh oh!
There was an error while loading. Please reload this page.