diff --git a/.github/codeql/codeql-config-javascript.yml b/.github/codeql/codeql-config-javascript.yml new file mode 100644 index 00000000..bc404c44 --- /dev/null +++ b/.github/codeql/codeql-config-javascript.yml @@ -0,0 +1,5 @@ +name: "CodeQL config" + +paths: + - project-1 + - project-3 \ No newline at end of file diff --git a/.github/codeql/codeql-config-python.yml b/.github/codeql/codeql-config-python.yml new file mode 100644 index 00000000..be22fddf --- /dev/null +++ b/.github/codeql/codeql-config-python.yml @@ -0,0 +1,4 @@ +name: "CodeQL config" + +paths: + - python-project \ No newline at end of file diff --git a/.github/scripts/list-all b/.github/scripts/list-all new file mode 100755 index 00000000..4ce945d5 --- /dev/null +++ b/.github/scripts/list-all @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# +# This script prints a JSON representation of an include matrix that will create a job for each top level folder/language combination detected in the repository. +# +# {"include": [{"target-dir": ".github", "languages": "javascript"}, {"target-dir": "project with spaces", "languages": "javascript"}, {"target-dir": "project-1", "languages": "javascript"}, {"target-dir": "python-project", "languages": "python"}]} +# + +from genericpath import isdir +import json +import os +import glob + +javascript = [".js", ".jsx", ".mjs", ".es", ".es6", ".htm", ".html", ".xhtm", ".xhtml", ".vue", ".hbs", ".ejs", ".njk", ".json", ".yaml", ".yml", ".raml", ".xml"] +typescript = [".ts", ".tsx", ".mts", ".cts"] +c_and_cplus = [".cpp", ".c++", ".cxx", ".hpp", ".hh", ".h++", ".hxx", ".c", ".cc", ".h"] +csharp = [".sln", ".csproj", ".cs", ".cshtml", ".xaml"] +golang = [".go"] +python_lang = [".py"] +java = [".java"] +ruby = [".rb", ".erb", ".gemspec", "Gemfile"] + +outlines = dict() +outlines["include"] = set() + +def serialize_sets(obj): + if isinstance(obj, set): + l = list() + for item in obj: + if isinstance(item, tuple): + l.append(dict((x, y) for x, y in item)) + return l + +def 
find_in_list(list, string): + for item in list: + if string.strip().endswith(item): + return True + return False + +for line in glob.glob('**', recursive=True): + path = line.split('/')[0] + if find_in_list(javascript, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "javascript"}).items())) + if find_in_list(typescript, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "javascript"}).items())) + if find_in_list(c_and_cplus, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "cpp"}).items())) + if find_in_list(csharp, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "csharp"}).items())) + if find_in_list(golang, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "go"}).items())) + if find_in_list(python_lang, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "python"}).items())) + if find_in_list(java, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "java"}).items())) + if find_in_list(ruby, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "ruby"}).items())) + +print(json.dumps(outlines, default=serialize_sets)) diff --git a/.github/scripts/list-changed b/.github/scripts/list-changed new file mode 100755 index 00000000..a056d375 --- /dev/null +++ b/.github/scripts/list-changed @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +# +# This script prints a JSON representation of an include matrix - based on the output of git diff - that will create a job for each top level folder/language combination detected in the repository. 
+# +# {"include": [{"target-dir": ".github", "languages": "javascript"}, {"target-dir": "project with spaces", "languages": "javascript"}, {"target-dir": "project-1", "languages": "javascript"}, {"target-dir": "python-project", "languages": "python"}]} +# + +from genericpath import isdir +import json +import os + +javascript = [".js", ".jsx", ".mjs", ".es", ".es6", ".htm", ".html", ".xhtm", ".xhtml", ".vue", ".hbs", ".ejs", ".njk", ".json", ".yaml", ".yml", ".raml", ".xml"] +typescript = [".ts", ".tsx", ".mts", ".cts"] +c_and_cplus = [".cpp", ".c++", ".cxx", ".hpp", ".hh", ".h++", ".hxx", ".c", ".cc", ".h"] +csharp = [".sln", ".csproj", ".cs", ".cshtml", ".xaml"] +golang = [".go"] +python_lang = [".py"] +java = [".java"] +ruby = [".rb", ".erb", ".gemspec", "Gemfile"] + +lines = list(open("./.github/scripts/diff.txt").readlines()) +outlines = dict() +outlines["include"] = set() + +def serialize_sets(obj): + if isinstance(obj, set): + l = list() + for item in obj: + if isinstance(item, tuple): + l.append(dict((x, y) for x, y in item)) + return l + +def find_in_list(list, string): + for item in list: + if string.strip().endswith(item): + return True + return False + +for line in lines: + path = line.split('/')[0] + if find_in_list(javascript, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "javascript"}).items())) + if find_in_list(typescript, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "javascript"}).items())) + if find_in_list(c_and_cplus, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "cpp"}).items())) + if find_in_list(csharp, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "csharp"}).items())) + if find_in_list(golang, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "go"}).items())) 
+ if find_in_list(python_lang, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "python"}).items())) + if find_in_list(java, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "java"}).items())) + if find_in_list(ruby, line) and (os.path.isdir(path)): + outlines["include"].add(tuple(dict({"target-dir": path, "languages": "ruby"}).items())) + +print(json.dumps(outlines, default=serialize_sets)) diff --git a/.github/workflows/code-scanning.yml b/.github/workflows/code-scanning.yml index 4048ae1e..db1b9772 100644 --- a/.github/workflows/code-scanning.yml +++ b/.github/workflows/code-scanning.yml @@ -1,4 +1,4 @@ -name: Code scanning +name: Code scanning for all apps # # Scan the code using CodeQL whenever new commits are pushed to the main branch @@ -18,48 +18,71 @@ on: paths-ignore: - 'docs/**' - '*' + schedule: + - cron: "35 13 * * 2" + workflow_dispatch: + jobs: - generate-dir-list: + generate-scan-list: + # Find all the top level directories in the repository and use them for the scan + # when the workflow is not triggered by a pull_request + # name: Generate directory list runs-on: ubuntu-latest outputs: - dir-list: ${{steps.find-dirs.outputs.dir-list}} + matrix: ${{steps.set-matrix.outputs.matrix}} steps: - - name: Checkout repository - uses: actions/checkout@v2 + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 - # - # Generate a JSON array containing all non-hidden subdirectories of the - # repository's root directory and store it as a job output so it can be - # consumed by all downstream jobs depending on this one. 
For more - # information about this, visit: - # - # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#jobsjob_idoutputs - # - - name: Find existing directories - id: find-dirs - run: | - echo "::set-output name=dir-list::$(./.github/scripts/list-dirs)" + - name: Find all apps + if: ${{ github.event_name != 'pull_request'}} + id: find-all + run: | + echo "::set-output name=all::$(./.github/scripts/list-all)" + + - name: Find changed apps + if: ${{ github.event_name == 'pull_request'}} + id: find-changed + run: | + git diff --name-only origin/$GITHUB_BASE_REF $GITHUB_SHA >./.github/scripts/diff.txt + echo "::set-output name=changed::$(./.github/scripts/list-changed)" - codeql: - name: Scan code with CodeQL - needs: generate-dir-list + - name: Setup scanning matrix + id: set-matrix + env: + ALL: ${{ steps.find-all.outputs.all }} + CHANGED: ${{ steps.find-changed.outputs.changed }} + run: | + echo "::set-output name=matrix::$ALL$CHANGED" + echo "::notice::All set to $ALL" + echo "::notice::Changed set to $CHANGED" + + - name: Upload diff as artifact + if: ${{ github.event_name == 'pull_request'}} + uses: actions/upload-artifact@v2 + with: + name: diff + path: | + ./.github/scripts/diff.txt + + + + codeql-scan: + name: Scanning ${{matrix.target-dir}} (${{ matrix.languages }}) with CodeQL + needs: generate-scan-list runs-on: ubuntu-latest strategy: - matrix: - target-dir: ${{fromJson(needs.generate-dir-list.outputs.dir-list)}} - # - # Prevent the creation of jobs for directories where code scanning is - # not necessary/desired. 
- # - exclude: - - target-dir: docs + fail-fast: false + matrix: ${{ fromJson(needs.generate-scan-list.outputs.matrix) }} steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 # # Build the configuration file for CodeQL to instruct it to only scan the @@ -70,16 +93,22 @@ jobs: # - name: Build CodeQL config file env: - TARGET_DIR: ${{matrix.target-dir}} + TARGET_DIR: ${{ matrix.target-dir }} run: | cp .github/codeql/codeql-config-template.yml codeql-config.yml sed -i 's@__TARGET_DIR__@'"$TARGET_DIR"'@' codeql-config.yml - name: Initialize CodeQL - uses: github/codeql-action/init@v1 + uses: github/codeql-action/init@v2 with: config-file: codeql-config.yml - languages: javascript + languages: ${{ matrix.languages }} + + - name: Attempting build + if: ${{ (matrix.languages == 'cpp' || matrix.languages == 'csharp' || matrix.languages == 'java') }} + uses: github/codeql-action/autobuild@v2 - name: Perform CodeQL analysis - uses: github/codeql-action/analyze@v1 \ No newline at end of file + uses: github/codeql-action/analyze@v2 + with: + category: ${{ matrix.target-dir }}-${{ matrix.languages }} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..29127430 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.github/scripts/diff.txt +**/node_modules/** diff --git a/README.md b/README.md index ff796aee..a3760892 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # Parallel code scanning with CodeQL +https://github.com/thedave42/parallel-code-scanning/labels/documentation + If you have a large repository containing various independent projects (a "monorepo"), the time taken to scan your code with CodeQL can be significantly reduced by splitting the scanning work into various parallel jobs which will @@ -20,6 +22,11 @@ this repository (e.g. 
`project-4`) requires no changes to the workflow file as a dedicated code scanning job will be automatically generated for it when the workflow is executed. +If the workflow is triggered by a pull request the list of sub-directories that +will be scanned will be limited to the subdirectories that contain changes. The +changes are based on a `git diff` between the base and head repositories specified +in the pull request. + This strategy is possible because GitHub Actions workflows accept JSON input to define a job matrix, and the JSON contents can be generated during the workflow's execution. In other words, the job matrix can be defined dynamically. @@ -33,20 +40,7 @@ general capabilities of CodeQL before doing this. ## Answers to common questions -**1.** _Even if files in only one subdirectory in the repository are changed, -code scanning jobs will be generated for all subdirectories containing software -projects, which is wasteful. Is it possible to limit the generation of jobs so -that only subdirectories with modified files will be scanned?_ - -Yes. The list of subdirectories which is used as input for the code scanning job -matrix is produced by a [script](./.github/scripts/list-dirs) which simply -outputs all subdirectories under the repository's root directory. This script -can be modified in any way you want, so you can use [`git -diff`](https://stackoverflow.com/questions/50440420/git-diff-only-show-which-directories-changed) -to build a list containing only subdirectories with modified files and use that -list as input for the job matrix generation. - -**2.** _Every code scanning job checks out the repository in parallel. If a +**1.** _Every code scanning job checks out the repository in parallel. If a change is made to the repository during that time (e.g. 
a subdirectory is added or removed, or a file in a pre-existing subdirectory is modified), you essentially have a race condition which is not being properly handled._ @@ -63,4 +57,4 @@ very first job which is executed in the workflow and then consuming that artifact in all downstream jobs. The [`actions/upload-artifact`](https://github.com/actions/upload-artifact) and [`actions/download-artifact`](https://github.com/actions/download-artifact) -actions will help you accomplish this. \ No newline at end of file +actions will help you accomplish this. diff --git a/project with spaces/app with spaces.js b/project with spaces/app with spaces.js new file mode 100644 index 00000000..7df7d571 --- /dev/null +++ b/project with spaces/app with spaces.js @@ -0,0 +1 @@ +console.log("Hello World"); \ No newline at end of file diff --git a/project-1/add.js b/project-1/add.js index 1e691765..f1993904 100644 --- a/project-1/add.js +++ b/project-1/add.js @@ -1,5 +1,4 @@ import createMathOperation from './.internal/createMathOperation.js' - /** * Adds two numbers. * diff --git a/python-project/list-changed-dirs.py b/python-project/list-changed-dirs.py new file mode 100755 index 00000000..d51ee957 --- /dev/null +++ b/python-project/list-changed-dirs.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +# +# This script prints a JSON array containing all non-hidden subdirectories of +# the current working directory. 
As an example, if the current working +# directory contains the subdirectories "foo", "bar" and "baz", the output +# will be (the order of the directories is not necessarily alphabetical): +# +# ["foo", "bar", "baz"] +# +from genericpath import isdir +import json +import os + +lines = list(open('./.github/scripts/diff.txt').readlines()) +outlines = set() + +#only add items that are directories +for line in lines: + path = line.split('/')[0] + if (os.path.isdir(path)): + outlines.add(path) + +print(json.dumps(list(outlines))) diff --git a/python-project/list-changed-langs.py b/python-project/list-changed-langs.py new file mode 100755 index 00000000..544d049f --- /dev/null +++ b/python-project/list-changed-langs.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 +# +# This script prints a JSON array containing all the supported CodeQL programming languages based on the file extension +# +# ["foo", "bar", "baz"] +# +import json + +javascript = [".js", ".jsx", ".mjs", ".es", ".es6", ".htm", ".html", ".xhtm", ".xhtml", ".vue", ".hbs", ".ejs", ".njk", ".json", ".yaml", ".yml", ".raml", ".xml"] +typescript = [".ts", ".tsx", ".mts", ".cts"] +c_and_cplus = [".cpp", ".c++", ".cxx", ".hpp", ".hh", ".h++", ".hxx", ".c", ".cc", ".h"] +csharp = [".sln", ".csproj", ".cs", ".cshtml", ".xaml"] +golang = [".go"] +python_lang = [".py"] +java = [".java"] +ruby = [".rb", ".erb", ".gemspec", "Gemfile"] + + +lines = list(open("./.github/scripts/diff.txt").readlines()) +outlines = set() + +def find_in_list(list, string): + for item in list: + if item in string: + return True + return False + +#only add languages whose file extensions appear in the diff +for line in lines: + if find_in_list(javascript, line): + outlines.add("javascript") + if find_in_list(typescript, line): + outlines.add("javascript") + if find_in_list(javascript, line): + outlines.add("javascript") + + + +print(json.dumps(list(outlines))) diff --git a/.github/scripts/list-dirs b/python-project/list-dirs.py similarity index 87% rename from 
.github/scripts/list-dirs rename to python-project/list-dirs.py index 3c530b53..1210d0f9 100755 --- a/.github/scripts/list-dirs +++ b/python-project/list-dirs.py @@ -12,4 +12,4 @@ import glob import json -print(json.dumps(glob.glob("*/")).replace("/", "")) \ No newline at end of file +print(json.dumps(glob.glob("*/")).replace("/", ""))