From 3120370736ebc5e518d5fa0bfe212f161cd1499a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 3 Dec 2024 21:40:33 -0500 Subject: [PATCH 1/4] Add github action to codespell master on push and PRs --- .github/workflows/codespell.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .github/workflows/codespell.yml diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 0000000..b026c85 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,25 @@ +# Codespell configuration is within .codespellrc +--- +name: Codespell + +on: + push: + branches: [master] + pull_request: + branches: [master] + +permissions: + contents: read + +jobs: + codespell: + name: Check for spelling errors + runs-on: ubuntu-latest + + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Annotate locations with typos + uses: codespell-project/codespell-problem-matcher@v1 + - name: Codespell + uses: codespell-project/actions-codespell@v2 From 16442b6facf865d93b8f13f742268e772e93e7ca Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 3 Dec 2024 21:40:33 -0500 Subject: [PATCH 2/4] Add rudimentary codespell config --- .codespellrc | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .codespellrc diff --git a/.codespellrc b/.codespellrc new file mode 100644 index 0000000..6ae4387 --- /dev/null +++ b/.codespellrc @@ -0,0 +1,6 @@ +[codespell] +# Ref: https://github.com/codespell-project/codespell#using-a-config-file +skip = .git*,*.pdf,*.svg,*.css,*.min.*,*-min.*,.codespellrc,static +check-hidden = true +ignore-regex = \b(SPACIN|LOD|UE|Alma Mater)\b +# ignore-words-list = From c4e4bbb0425e5174a37494829d46f11409dbe440 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 3 Dec 2024 21:43:02 -0500 Subject: [PATCH 3/4] [DATALAD RUNCMD] run codespell throughout fixing typos automagically (but ignoring overall fail due to ambiguous ones) === Do not change lines below === { 
"chain": [], "cmd": "codespell -w || :", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." } ^^^ Do not change lines above ^^^ --- auth_example.json | 4 ++-- html-template/about.html | 2 +- html-template/ccc.html | 6 +++--- html-template/datasets.html | 2 +- html-template/download_legacy.html | 2 +- html-template/home.html | 2 +- html-template/meta.html | 4 ++-- html-template/tools.html | 2 +- src/ramose.py | 8 ++++---- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git a/auth_example.json b/auth_example.json index 15e89dc..8a69174 100644 --- a/auth_example.json +++ b/auth_example.json @@ -24,7 +24,7 @@ "invalid_captcha": "This message warns of an error with the captcha", "invalid_form": "This message warns that a form cannot be resubmitted" }, - "accesstokensuccess": "Sucess message displayed after an access token request", + "accesstokensuccess": "Success message displayed after an access token request", "accesstokenconfirm": { "success": "Access token confirmation ", "failure": "Message in case of link expiration" @@ -36,7 +36,7 @@ "token_button": "Toke button message", "ignore": "Ignore message in the case of wrong mail", "signature": "Signature of the email", - "html_message": "Message displayed in the case HTML email is not supported, the link is concatened to this string" + "html_message": "Message displayed in the case HTML email is not supported, the link is concatenated to this string" } } } diff --git a/html-template/about.html b/html-template/about.html index fc0e8a7..3e0076e 100644 --- a/html-template/about.html +++ b/html-template/about.html @@ -94,7 +94,7 @@

International Advisory Board for Open

Vincent Larivière (Canadian Research Chair on the Transformation of Scholarly Communication, University of Montréal)

  • Catriona MacCallum (Director of Open Science, Hindawi Open Access Publisher)

  • Cameron Neylon (Professor of Research Communications, Centre for Culture and Technology, Curtin University)

  • -
  • Katherine Skinner (Reasearch Lead, Invest in Open)

  • +
  • Katherine Skinner (Research Lead, Invest in Open)

  • Didier Torny (on behalf of the French Open Science Committee)

  • Ludo Waltman (Professor of Quantitative Science Studies and Deputy Director, Centre for Science and Technology Studies (CWTS), Leiden University) [chair]

  • diff --git a/html-template/ccc.html b/html-template/ccc.html index 0bc590c..e312a36 100644 --- a/html-template/ccc.html +++ b/html-template/ccc.html @@ -100,13 +100,13 @@

    Scope

  • Sequence number: along with XPath identifiers, discourse elements are identified with a more human-readable sequence number (e.g. Section n. 1, Paragraph n. 3, Table n. 2), - indentifying their relative position in the document. CCC stores sequence numbers of discourse elements that include at least one in-text reference.

  • + identifying their relative position in the document. CCC stores sequence numbers of discourse elements that include at least one in-text reference.

  • OCI: an OCI is a global persistent identifier of citations. It usually appears in the form oci:<citing>-<cited> where citing and cited are locally assigned numerical identifiers of respectively a citing document and a cited document. In CCC an OCI is assigned to both the general citation - in the same form <citing>-<cited> - and to every occurrence of an in-text reference in the citing document relevant to that citation. For instance: the article identified as 0701 in CCC cites the article identified as 07090, and two in-text references appear in the citing article referencing the cited article. - The general OCI for the citation will be 0701-07090, while the two specific citations instatiated by in-text references will be addressed as 0701-07090/1 and 0701-07090/2 respectively.

  • + The general OCI for the citation will be 0701-07090, while the two specific citations instantiated by in-text references will be addressed as 0701-07090/1 and 0701-07090/2 respectively.

  • InTRePID: the In-Text Reference Pointer Identifier (InTRePID) is a global unique persistent identifier (PID) of in-text reference pointers. - InTRePID is an extention of OCI that appears in the following form: intrepid:<oci>/<ordinal>-<total> + InTRePID is an extension of OCI that appears in the following form: intrepid:<oci>/<ordinal>-<total> where <oci> is is the numerical part of the OCI identiying a citation between a citing and cited entity, <ordinal> is the nth occurrence of an in-text reference pointer within the text of the citing entity relevant to the cited entity addressed in the OCI, and <total> is the total number of in-text reference pointers diff --git a/html-template/datasets.html b/html-template/datasets.html index 47a84a9..04a7810 100644 --- a/html-template/datasets.html +++ b/html-template/datasets.html @@ -82,7 +82,7 @@

    Datasets

    • -

      OpenCitations Index. The Index contain information about the citations themselves, in which the citations, instead of being considered as simple links, are treated as first-class data entities in their own right. The Index do not store metadata about the citing and cited bibliographic entities internally. Rather, these entities are identified in the Index by their unique identifiers, i.e. the OpenCitations Meta Indentifier (OMID), enabling bibliographic information to be retrieved on-the-fly upon request by means of the OpenCitations Meta API. +

      OpenCitations Index. The Index contain information about the citations themselves, in which the citations, instead of being considered as simple links, are treated as first-class data entities in their own right. The Index do not store metadata about the citing and cited bibliographic entities internally. Rather, these entities are identified in the Index by their unique identifiers, i.e. the OpenCitations Meta Identifier (OMID), enabling bibliographic information to be retrieved on-the-fly upon request by means of the OpenCitations Meta API.

    • OpenCitations Meta. OpenCitations Meta stores and delivers bibliographic metadata for all publications involved in the OpenCitations Index.

    diff --git a/html-template/download_legacy.html b/html-template/download_legacy.html index 21faea4..1ac3165 100644 --- a/html-template/download_legacy.html +++ b/html-template/download_legacy.html @@ -78,7 +78,7 @@

    $active.title()

    -

    This page is a legacy page (not linked anymore from the official website) that links all the dumps produced by OpenCitations before October 2023 that are not mantained anymore.

    +

    This page is a legacy page (not linked anymore from the official website) that links all the dumps produced by OpenCitations before October 2023 that are not maintained anymore.

    This page details of and links to all the data dumps of the OpenCitations Indexes, the Open Biomedical Citations in Context Corpus and of the OpenCitations Corpus (OCC). They are made available online by means of the support of Figshare and of the Internet Archive.

    Each dump of an OpenCitations Index is composed by four zip archives. Two of these archives contains the actual data and provenance information of the index in N-Triples, while the other two archives contain the same information in CSV.

    Each dump of the Open Biomedical Citations in Context Corpus is composed by one single zip artchive containing all the information about actual data and provenance stored in JSON-LD.

    diff --git a/html-template/home.html b/html-template/home.html index 08a7439..85e30db 100644 --- a/html-template/home.html +++ b/html-template/home.html @@ -85,7 +85,7 @@

    OpenCitations is an independent not-for-profit infrastructure organization for open scholarship dedicated to the publication of open bibliographic and citation data by the use of Semantic Web (Linked Data) technologies. It is also engaged in advocacy for open citations, particularly in its role as a key founding member of the Initiative for Open Citations (I4OC). For administrative convenience, OpenCitations is managed by the Research Centre for Open Scholarly Metadata at the University of Bologna.

    -

    OpenCitations espouses fully the founding principles of Open Science. It complies with the FAIR data principles by Force11 that data should be findable, accessible, interoperable and re-usable, and it complies with the recommendations of I4OC that citation data in particular should be structured, separable, and open. On the latter topic, OpenCitations has recently published a formal definition of an Open Citation, and has launched a system for globally unique and persistent identifiers (PIDs) for bibliographic citations – Open Citation Identifiers (OCIs).

    +

    OpenCitations espouses fully the founding principles of Open Science. It complies with the FAIR data principles by Force11 that data should be findable, accessible, interoperable and reusable, and it complies with the recommendations of I4OC that citation data in particular should be structured, separable, and open. On the latter topic, OpenCitations has recently published a formal definition of an Open Citation, and has launched a system for globally unique and persistent identifiers (PIDs) for bibliographic citations – Open Citation Identifiers (OCIs).

    OpenCitations' involvement in international networks and collaborations, together with the need of identifying and reaching out to new stakeholders to assure OpenCitations' development and sustainability, has made it necessary to define OpenCitations' mission, unique strengths and next developmental steps, summarized in the following publicly available documents: OpenCitations Mission Statement, The Uniqueness of OpenCitations and OpenCitations – Present Status and Future Plans.

    diff --git a/html-template/meta.html b/html-template/meta.html index 261a644..deb33c8 100644 --- a/html-template/meta.html +++ b/html-template/meta.html @@ -80,7 +80,7 @@

    OpenCitations Meta

    The OpenCitations Meta database stores and delivers bibliographic metadata for all publications involved in the OpenCitations Index.

    -

    For each publication, the metadata exposed by OpenCitations Meta includes the publication's title, type, venue (e.g. journal name), volume number, issue number, page numbers, publication date, and identifiers such as Digital Object Identifiers (DOIs) and PubMed Identifiers (PMIDs). In addition, OpenCitations Meta includes details of the main actors involved in the publication of the document, i.e., the names of the authors, editors, and publishers, each with its own additional metadata and identifier (e.g. ORCID).

    +

    For each publication, the metadata exposed by OpenCitations Meta includes the publication's title, type, venue (e.g. journal name), volume number, issue number, page numbers, publication date, and identifiers such as Digital Object Identifiers (DOIs) and PubMed Identifiers (PMIDs). In addition, OpenCitations Meta includes details of the main actors involved in the publication of the document, i.e., the names of the authors, editors, and publishers, each with its own additional metadata and identifier (e.g. ORCID).

    Currently, OpenCitations Meta contains:

      @@ -98,7 +98,7 @@

      Entity URLs

      The OpenCitations Meta Identifier

      Every entity in OpenCitations Meta is assigned persistent internal identifier called OpenCitations Meta Identifier (OMID). The OMID has structure [[entity_type_abbreviation]]/[[supplier_prefix]][[sequential_number]]. - For example, the first journal article ever processed has OMID br/0601 (the full URI is https://w3id.org/oc/meta/br/0601), where br is the abbreviation of bibliographic resource, and 060 corresponds to the supplier prefix, helpful in recognising at a glance the index it belongs to (i.e., OpenCitations Meta). Finally, 1 indicates that this is the index's first bibliographic resource ever minted.

      + For example, the first journal article ever processed has OMID br/0601 (the full URI is https://w3id.org/oc/meta/br/0601), where br is the abbreviation of bibliographic resource, and 060 corresponds to the supplier prefix, helpful in recognising at a glance the index it belongs to (i.e., OpenCitations Meta). Finally, 1 indicates that this is the index's first bibliographic resource ever minted.

      The entities subject to deduplication and associated with an OMID are identifiers (abbr. id), agent roles (i.e., authors, editors, publishers, abbr. ar), responsible agents (i.e., people and organisations, abbr. ra), resource embodiments (i.e., pages, abbr. re), diff --git a/html-template/tools.html b/html-template/tools.html index 937f484..1eb879f 100644 --- a/html-template/tools.html +++ b/html-template/tools.html @@ -90,7 +90,7 @@

      Software

      OpenCitations created several software applications and libraries used to create all the data stored in the various databases. In particular:

      • BEE, a software for parsing articles from the Open Access Subset of biomedical literature hosted by Europe PubMed Central (EPMC) and encoded in JATS/XML; we have recently extended it for creating the CCC corpus, which now allows to extract also in-text references and discourse elements (e.g. sentences, paragraphs, sections)

      • -
      • SPACIN, a software for transforming JSON data (output of BEE) into RDF according to the OCDM; we have recently extended it for creating the CCC corpus, so as to reengineer information about in-text references and discourse elements.

      • +
      • SPACIN, a software for transforming JSON data (output of BEE) into RDF according to the OCDM; we have recently extended it for creating the CCC corpus, so as to reengineer information about in-text references and discourse elements.

      • Create New Citations, a software for creating citation indexes (i.e. doi-to-doi citations) according to OCDM;

      • BCite, a user-friendly web application - released in beta version - for creating RDF-data according to OCDM from lists of bibliographic references;

      diff --git a/src/ramose.py b/src/ramose.py index ae983e1..dd20bec 100644 --- a/src/ramose.py +++ b/src/ramose.py @@ -19,7 +19,7 @@ # RAMOSE v1.1 # Ivan Heibi # [+] The preprocessing operation can output a list of values, this means Ramose will perform multiple sequential SPARQL queries (using the SPARQL query defined for that operation). -# The marge of the final result is performed by the post processing operation. (e.g., see /venue-citation-count in indexapi_v2.py) +# The merge of the final result is performed by the post processing operation. (e.g., see /venue-citation-count in indexapi_v2.py) __author__ = 'essepuntato' @@ -206,7 +206,7 @@ def __parameters(self): 4. `format=`: the final table is returned in the format specified in `` that can be either "csv" or "json" - e.g. `format=csv` returns the final table in CSV format. This parameter has higher priority of the type specified through the "Accept" header of the request. Thus, if the header of a request to the API specifies `Accept: text/csv` and the URL of such request includes `format=json`, the final table is returned in JSON. -5. `json=("",,,,...)`: in case a JSON format is requested in return, tranform each row of the final JSON table according to the rule specified. If `` is set to "array", the string value associated to the field name `` is converted into an array by splitting the various textual parts by means of ``. For instance, considering the JSON table `[ { "names": "Doe, John; Doe, Jane" }, ... ]`, the execution of `array("; ",names)` returns `[ { "names": [ "Doe, John", "Doe, Jane" ], ... ]`. Instead, if `` is set to "dict", the string value associated to the field name `` is converted into a dictionary by splitting the various textual parts by means of `` and by associating the new fields ``, ``, etc., to these new parts. For instance, considering the JSON table `[ { "name": "Doe, John" }, ... 
]`, the execution of `dict(", ",name,fname,gname)` returns `[ { "name": { "fname": "Doe", "gname": "John" }, ... ]`. +5. `json=("",,,,...)`: in case a JSON format is requested in return, transform each row of the final JSON table according to the rule specified. If `` is set to "array", the string value associated to the field name `` is converted into an array by splitting the various textual parts by means of ``. For instance, considering the JSON table `[ { "names": "Doe, John; Doe, Jane" }, ... ]`, the execution of `array("; ",names)` returns `[ { "names": [ "Doe, John", "Doe, Jane" ], ... ]`. Instead, if `` is set to "dict", the string value associated to the field name `` is converted into a dictionary by splitting the various textual parts by means of `` and by associating the new fields ``, ``, etc., to these new parts. For instance, considering the JSON table `[ { "name": "Doe, John" }, ... ]`, the execution of `dict(", ",name,fname,gname)` returns `[ { "name": { "fname": "Doe", "gname": "John" }, ... ]`. It is possible to specify one or more filtering operation of the same kind (e.g. `require=given_name&require=family_name`). In addition, these filtering operations are applied in the order presented above - first all the `require` operation, then all the `filter` operations followed by all the `sort` operation, and finally the `format` and the `json` operation (if applicable). It is worth mentioning that each of the aforementioned rules is applied in order, and it works on the structure returned after the execution of the previous rule. 
@@ -870,7 +870,7 @@ def __init__(self, op_complete_url, op_key, i, tp, sparql_http_method, addon): It takes in input a full URL referring to a call to an operation (parameter 'op_complete_url'), the particular shape representing an operation (parameter 'op_key'), the definition (in JSON) of such operation (parameter 'i'), the URL of the triplestore to contact (parameter 'tp'), the HTTP method - to use for the SPARQL request (paramenter 'sparql_http_method', set to either 'get' or 'post'), and the path + to use for the SPARQL request (parameter 'sparql_http_method', set to either 'get' or 'post'), and the path of the Python file which defines additional functions for use in the operation (parameter 'addon').""" self.url_parsed = urlsplit(op_complete_url) self.op_url = self.url_parsed.path @@ -1332,7 +1332,7 @@ def exec(self, method="get", content_type="application/json"): for combination in combinations: parameters_comb.append( dict(zip(list(par_dict.keys()), list(combination))) ) - # the __parameters_comb__ varaible is a list of dictionaries, + # the __parameters_comb__ variable is a list of dictionaries, # each dictionary stores a possible combination of parameter values # # Example: {"id":"5","area":["A1","A2"]} -> [ {"id":"5","area":"A1"}, {"id":"5","area":"A2"} ] From 3b463c7744d80e1933f5480fa6c6f1c19717caad Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Tue, 3 Dec 2024 21:43:58 -0500 Subject: [PATCH 4/4] [DATALAD RUNCMD] Do interactive fixing of some ambiguous typos === Do not change lines below === { "chain": [], "cmd": "codespell -w -i 3 -C 2", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." 
} ^^^ Do not change lines above ^^^ --- auth_example.json | 2 +- html-template/ccc.html | 2 +- src/intrepid.py | 2 +- src/oci.py | 2 +- src/ramose.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/auth_example.json b/auth_example.json index 8a69174..f185673 100644 --- a/auth_example.json +++ b/auth_example.json @@ -33,7 +33,7 @@ "title": "Email title", "description": "Email description", "token" : "Message before the button", - "token_button": "Toke button message", + "token_button": "Token button message", "ignore": "Ignore message in the case of wrong mail", "signature": "Signature of the email", "html_message": "Message displayed in the case HTML email is not supported, the link is concatenated to this string" diff --git a/html-template/ccc.html b/html-template/ccc.html index e312a36..1be1e9d 100644 --- a/html-template/ccc.html +++ b/html-template/ccc.html @@ -201,7 +201,7 @@

      An example

      datacite:usesIdentifierScheme datacite:oci ; literal:hasLiteralValue "oci:0701-0702/2". -

      The first in-text reference rp/0701 appears as "Doe et al. 2020". It appears in the firt section de/0701, called "Introduction", +

      The first in-text reference rp/0701 appears as "Doe et al. 2020". It appears in the first section de/0701, called "Introduction", second paragraph de/0702, third sentence de/0703 (being section, paragraph and sentence numbers relative to the entire document and not to the parent element). Both in-text references and the discourse elements are also identified by a XPath.

      # the sentence
      diff --git a/src/intrepid.py b/src/intrepid.py
      index eaf870a..d51efcd 100644
      --- a/src/intrepid.py
      +++ b/src/intrepid.py
      @@ -246,7 +246,7 @@ def validate(self):
                   if not self.intrepid.startswith("intrepid:"):
                       self.intrepid = "intrepid:" + self.intrepid
                       self.add_message("validate", W, "The InTRePID specified as input doesn't start with the 'intrepid:' "
      -                                                "prefix. This has beed automatically added, resulting in "
      +                                                "prefix. This has been automatically added, resulting in "
                                                       "the InTRePID '%s'." % self.intrepid)
       
                   self.is_valid = False
      diff --git a/src/oci.py b/src/oci.py
      index 51d2d5b..cd1662b 100644
      --- a/src/oci.py
      +++ b/src/oci.py
      @@ -711,7 +711,7 @@ def validate(self):
                   if not self.oci.startswith("oci:"):
                       self.oci = "oci:" + self.oci
                       self.add_message("validate", W, "The OCI specified as input doesn't start with the 'oci:' "
      -                                                "prefix. This has beed automatically added, resulting in "
      +                                                "prefix. This has been automatically added, resulting in "
                                                       "the OCI '%s'." % self.oci)
       
                   self.is_valid = False
      diff --git a/src/ramose.py b/src/ramose.py
      index dd20bec..276e5f3 100644
      --- a/src/ramose.py
      +++ b/src/ramose.py
      @@ -909,7 +909,7 @@ def conv(s, query_string, c_type="text/csv"):
       
               content_type = Operation.get_content_type(c_type)
       
      -        # Overrite if requesting a particular format via the URL
      +        # Override if requesting a particular format via the URL
               if "format" in query_string:
                   req_formats = query_string["format"]
       
      @@ -1363,7 +1363,7 @@ def exec(self, method="get", content_type="application/json"):
       
                           sc = r.status_code
                           if sc == 200:
      -                        # This line has been added to avoid a strage behaviour of the 'splitlines' method in
      +                        # This line has been added to avoid a strange behaviour of the 'splitlines' method in
                               # presence of strange characters (non-UTF8).
                               list_of_lines = [line.decode("utf-8") for line in r.text.encode("utf-8").splitlines()]