From 1e36d673e7d2dd7a24b6cc41bffa8463eb82f272 Mon Sep 17 00:00:00 2001
From: Stanislav Pankevich <s.pankevich@gmail.com>
Date: Thu, 12 Jun 2025 21:21:40 +0200
Subject: [PATCH] Add boilerplate for automated testing of syntax grammar

---
 .github/workflows/ci-linux-ubuntu-latest.yml  | 38 ++++++++
 .gitignore                                    |  6 ++
 package.json                                  | 17 +++-
 parse_syntax.js                               | 46 +++++++++
 requirements.txt                              |  3 +
 tasks.py                                      | 95 +++++++++++++++++++
 tests/integration/lit.cfg.py                  | 24 +++++
 .../syntax/01_basic_document_node/sample.sdoc |  2 +
 .../syntax/01_basic_document_node/test.itest  |  7 ++
 9 files changed, 233 insertions(+), 5 deletions(-)
 create mode 100644 .github/workflows/ci-linux-ubuntu-latest.yml
 create mode 100644 parse_syntax.js
 create mode 100644 requirements.txt
 create mode 100644 tasks.py
 create mode 100644 tests/integration/lit.cfg.py
 create mode 100644 tests/integration/syntax/01_basic_document_node/sample.sdoc
 create mode 100644 tests/integration/syntax/01_basic_document_node/test.itest

diff --git a/.github/workflows/ci-linux-ubuntu-latest.yml b/.github/workflows/ci-linux-ubuntu-latest.yml
new file mode 100644
index 0000000..3156386
--- /dev/null
+++ b/.github/workflows/ci-linux-ubuntu-latest.yml
@@ -0,0 +1,38 @@
+name: "StrictDoc.tmLanguage on Linux"
+
+on:
+  pull_request:
+    branches: [ "**" ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Node.js
+      uses: actions/setup-node@v4
+      with:
+        node-version: '22'
+
+    - name: Set up Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'  # quoted so YAML reads a string, not a float
+
+    - name: Upgrade pip
+      run: |
+        python -m pip install --upgrade pip
+
+    - name: Install minimal Python packages
+      run: |
+        pip install -r requirements.txt
+
+    - name: Install Node packages
+      run: |
+        npm install
+
+    - name: Run tests
+      run: |
+        invoke test
diff --git a/.gitignore b/.gitignore
index f414e7b..f771ea3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,10 @@
 .idea
 node_modules
+package-lock.json
 *.vsix
 /_*
+
+# tests/integration
+.lit_test_times.txt
+**/Output/**
+
diff --git a/package.json b/package.json
index 3d70163..2bcdb77 100644
--- a/package.json
+++ b/package.json
@@ -33,10 +33,17 @@
         "configuration": "./language-configuration.json"
       }
     ],
-    "grammars": [{
-      "language": "sdoc",
-      "scopeName": "source.sdoc",
-      "path": "./syntaxes/sdoc.tmLanguage.json"
-    }]
+    "grammars": [
+      {
+        "language": "sdoc",
+        "scopeName": "source.sdoc",
+        "path": "./syntaxes/sdoc.tmLanguage.json"
+      }
+    ]
+  },
+  "dependencies": {
+    "onigasm": "^2.2.5",
+    "vscode-textmate": "^9.2.0",
+    "vscode-oniguruma": "^1.5.1"
   }
 }
diff --git a/parse_syntax.js b/parse_syntax.js
new file mode 100644
index 0000000..4adf460
--- /dev/null
+++ b/parse_syntax.js
@@ -0,0 +1,46 @@
+const fs = require('fs');
+const path = require('path');
+const vsctm = require('vscode-textmate');
+const oniguruma = require('vscode-oniguruma');
+
+const wasmBin = fs.readFileSync(path.join(__dirname, './node_modules/vscode-oniguruma/release/onig.wasm')).buffer;
+const vscodeOnigurumaLib = oniguruma.loadWASM(wasmBin).then(() => {
+    return {
+        createOnigScanner(patterns) { return new oniguruma.OnigScanner(patterns); },
+        createOnigString(s) { return new oniguruma.OnigString(s); }
+    };
+});
+
+const scopeName = "source.sdoc";
+const grammarPath = path.join(__dirname, "syntaxes/sdoc.tmLanguage.json");
+const filePath = process.argv[2];
+if (!fs.existsSync(filePath)) {
+  throw('File does NOT exist');
+}
+
+// Create a registry that can create a grammar from a scope name.
+const registry = new vsctm.Registry({
+  onigLib: vscodeOnigurumaLib,
+  loadGrammar: (scope) => {
+    if (scope === scopeName) {
+      const grammarData = fs.readFileSync(grammarPath, 'utf-8');
+      return Promise.resolve(vsctm.parseRawGrammar(grammarData, grammarPath));
+    }
+    return null;
+  }
+});
+
+registry.loadGrammar(scopeName).then(grammar => {
+    const lines = fs.readFileSync(filePath, 'utf-8').split(/\r?\n/);
+    let ruleStack = vsctm.INITIAL;
+
+    lines.forEach((line, lineIndex) => {
+      const lineTokens = grammar.tokenizeLine(line, ruleStack);
+      ruleStack = lineTokens.ruleStack;
+
+      lineTokens.tokens.forEach(token => {
+        const tokenText = line.slice(token.startIndex, token.endIndex);
+        console.log(`[${lineIndex + 1}:${token.startIndex}-${token.endIndex}] "${tokenText}" → ${token.scopes.join(' ')}`);
+      });
+    });
+  });
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ae7d6d2
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+invoke
+lit
+filecheck>=0.0.20,<1.0.0
diff --git a/tasks.py b/tasks.py
new file mode 100644
index 0000000..048f065
--- /dev/null
+++ b/tasks.py
@@ -0,0 +1,95 @@
+# Invoke is broken on Python 3.11
+# https://github.com/pyinvoke/invoke/issues/833#issuecomment-1293148106
+import inspect
+import os
+import re
+import shutil
+import sys
+import tempfile
+from enum import Enum
+from pathlib import Path
+from typing import Dict, Optional
+
+if not hasattr(inspect, "getargspec"):
+    inspect.getargspec = inspect.getfullargspec
+
+import invoke
+from invoke import task
+
+# Specifying encoding because Windows crashes otherwise when running Invoke
+# tasks below:
+# UnicodeEncodeError: 'charmap' codec can't encode character '\ufffd'
+# in position 16: character maps to <undefined>
+# Exporting PYTHONIOENCODING=utf8 is reportedly another option, but reopening
+# stdout with an explicit encoding (below) seems to work.
+# FIXME: If you are a Windows user and expert, please advise on how to do this
+# properly.
+sys.stdout = open(1, "w", encoding="utf-8", closefd=False, buffering=1)
+
+
+def run_invoke(
+    context,
+    cmd,
+    environment: Optional[dict] = None,
+    pty: bool = False,
+    warn: bool = False,
+) -> invoke.runners.Result:
+    def one_line_command(string):
+        return re.sub("\\s+", " ", string).strip()
+
+    return context.run(
+        one_line_command(cmd),
+        env=environment,
+        hide=False,
+        warn=warn,
+        pty=pty,
+        echo=True,
+    )
+
+
+@task()
+def test(
+    context,
+    focus=None,
+    debug=False,
+    no_parallelization=False,
+    fail_first=False,
+):
+    clean_itest_artifacts(context)
+
+    cwd = os.getcwd()
+
+    parse_syntax_script = f'node \\"{cwd}/parse_syntax.js\\"'
+
+    debug_opts = "-vv --show-all" if debug else ""
+    focus_or_none = f"--filter {focus}" if focus else ""
+    fail_first_argument = "--max-failures 1" if fail_first else ""
+    parallelize_opts = "" if not no_parallelization else "--threads 1"
+    test_folder = f"{cwd}/tests/integration"
+
+    itest_command = f"""
+        lit
+        --param PARSE_SYNTAX_EXEC="{parse_syntax_script}"
+        -v
+        {debug_opts}
+        {focus_or_none}
+        {fail_first_argument}
+        {parallelize_opts}
+        {test_folder}
+    """
+    run_invoke(
+        context,
+        itest_command,
+    )
+
+@task
+def clean_itest_artifacts(context):
+    # The command sometimes exits with 1 even if the files are deleted.
+    # warn=True ensures that the execution continues.
+    run_invoke(
+        context,
+        """
+        git clean -dX --force --quiet tests/integration/
+        """,
+        warn=True,
+    )
diff --git a/tests/integration/lit.cfg.py b/tests/integration/lit.cfg.py
new file mode 100644
index 0000000..10ad041
--- /dev/null
+++ b/tests/integration/lit.cfg.py
@@ -0,0 +1,24 @@
+# ruff: noqa: F821
+
+import os
+import sys
+from typing import Any
+
+import lit.formats
+
+config: Any
+lit_config: Any
+
+config.name = "StrictDoc integration tests"
+# NOTE(review): "0" is a truthy string, not False; confirm external-shell mode is intended.
+config.test_format = lit.formats.ShTest("0")
+config.suffixes = [".itest"]
+
+current_dir = os.getcwd()
+# Command that runs parse_syntax.js; read from `lit --param PARSE_SYNTAX_EXEC=...`.
+parse_syntax_exec = lit_config.params["PARSE_SYNTAX_EXEC"]
+
+# NOTE: All substitutions work for the RUN: statements but they don't for CHECK:.
+#       That's how LLVM LIT works.
+config.substitutions.append(("%THIS_TEST_FOLDER", '$(basename "%S")'))
+config.substitutions.append(("%parse_syntax", parse_syntax_exec))
diff --git a/tests/integration/syntax/01_basic_document_node/sample.sdoc b/tests/integration/syntax/01_basic_document_node/sample.sdoc
new file mode 100644
index 0000000..045d5d4
--- /dev/null
+++ b/tests/integration/syntax/01_basic_document_node/sample.sdoc
@@ -0,0 +1,2 @@
+[DOCUMENT]
+TITLE: Document Title
diff --git a/tests/integration/syntax/01_basic_document_node/test.itest b/tests/integration/syntax/01_basic_document_node/test.itest
new file mode 100644
index 0000000..8adf68e
--- /dev/null
+++ b/tests/integration/syntax/01_basic_document_node/test.itest
@@ -0,0 +1,7 @@
+RUN: %parse_syntax %S/sample.sdoc | filecheck %s
+
+CHECK: [1:0-10] "[DOCUMENT]" → source.sdoc keyword.sdoc
+CHECK: [2:0-5] "TITLE" → source.sdoc keyword.control.sdoc keyword.control.sdoc
+CHECK: [2:5-7] ": " → source.sdoc keyword.control.sdoc
+CHECK: [2:7-22] "Document Title" → source.sdoc keyword.control.sdoc string.sdoc
+CHECK: [3:0-1] "" → source.sdoc