- add license information to README.rst

RootLUG · RootLUG · commit c6b79d88f35d · 2020-11-03T13:01:20.000+01:00
- publish pypi dataset and add it to docs
diff --git a/README.rst b/README.rst
@@ -83,3 +83,22 @@ Authors & Contributors
 
 * **Martin Carnogursky** - *Initial work and project lead* - https://is.muni.cz/person/410345
 * **Mirza Zulfan** - *Logo Design* - https://github.com/mirzazulfan
+
+
+LICENSE
+=======
+The Aura framework is licensed under the **GPL-3.0** license.
+Datasets produced from global scans using Aura are released under the **CC BY-NC 4.0** license.
+Use the following citation when using Aura or data produced by Aura in research:
+
+::
+
+    @misc{Carnogursky2019thesis,
+    AUTHOR = "CARNOGURSKY, Martin",
+    TITLE = "Attacks on package managers [online]",
+    YEAR = "2019 [cit. 2020-11-02]",
+    TYPE = "Bachelor Thesis",
+    SCHOOL = "Masaryk University, Faculty of Informatics, Brno",
+    SUPERVISOR = "Vit Bukac",
+    URL = "Available at WWW <https://is.muni.cz/th/y41ft/>",
+    }
diff --git a/aura/cli.py b/aura/cli.py
@@ -220,16 +220,16 @@ def update_aura():
 
 
 @cli.command()
-@click.option("-o", "--out", default="-", type=click.File("w"))
 @click.option("-m", "--max-distance", default=2, type=click.IntRange(min=0, max=10))
 @click.option("-l", "--limit", default=100, type=click.INT)
+@click.option("-f", "--format", default="text")
 @click.argument("pkg", nargs=-1)
-def find_typosquatting(out, max_distance, limit=100, pkg=None):
+def find_typosquatting(max_distance, limit=100, pkg=None, format="text"):
     if limit <= 0:
         click.secho("Invalid value for limit", file=sys.stderr)
         sys.exit(1)
 
-    commands.generate_typosquatting(out=out, distance=max_distance, limit=limit, pkgs=pkg)
+    commands.generate_typosquatting(distance=max_distance, limit=limit, pkgs=pkg, format_uri=format)
 
 
 @cli.command()
diff --git a/aura/commands.py b/aura/commands.py
@@ -8,6 +8,7 @@
 from concurrent import futures
 from pathlib import Path
 from functools import partial
+from itertools import islice
 from typing import Union, Optional, Tuple, Generator, List
 
 import click
@@ -21,9 +22,8 @@
 from . import utils
 from . import mirror
 from . import typos
-from .package import PypiPackage
 from .analyzers.detections import Detection
-from .output.base import ScanOutputBase, DiffOutputBase, InfoOutputBase
+from .output.base import ScanOutputBase, DiffOutputBase, InfoOutputBase, TyposquattingOutputBase
 
 
 logger = config.get_logger(__name__)
@@ -214,28 +214,15 @@ def show_info():
     formatter.output_info_data(info_data)
 
 
-def generate_typosquatting(out, distance=2, limit=None, pkgs=None):
-    from .output.text import PrettyReport
-
-    p = PrettyReport()  # TODO: convert into plugin system
-
+def generate_typosquatting(distance=2, limit=None, pkgs=None, format_uri="text"):
     if not pkgs:
         pkgs = typos.get_popular_packages()
 
     f = partial(typos.damerau_levenshtein, max_distance=distance)
     combinations = typos.generate_combinations(left=pkgs)
 
-    for idx, data in enumerate(typos.enumerator(combinations, f)):
-        if limit and idx >= limit:
-            break
-
-        try:
-            diff_table = data["orig_pkg"]._cmp_info(data["typo_pkg"])
-            t1 = data["orig_score"].get_score_table()
-            t2 = data["typo_score"].get_score_table()
-            p.print_tables(t1, t2, diff_table)
-        except exceptions.NoSuchPackage:
-            continue
+    formatter = TyposquattingOutputBase.from_uri(format_uri)
+    formatter.output_typosquatting(islice(typos.enumerator(combinations, f), 0, limit))
 
 
 def prefetch(*uris):
diff --git a/aura/output/base.py b/aura/output/base.py
@@ -55,7 +55,7 @@ def get_format(cls, uri: str, parsed=None) -> OutputBase:
             elif fmt.is_supported(parsed_uri=parsed):
                 return fmt
 
-        raise exceptions.InvalidOutput("No such output format")
+        raise exceptions.InvalidOutput(f"No such output format `{uri}`")
 
 
 @dataclass()
@@ -165,6 +165,21 @@ def filtered(self, hits):
         return processed
 
 
+@dataclass()
+class TyposquattingOutputBase(OutputBase, metaclass=ABCMeta):
+    @classmethod
+    def entrypoint(cls) -> str:
+        return "aura.typosquatting_output_handlers"
+
+    @classmethod
+    def from_uri(cls, uri: str) -> TyposquattingOutputBase:
+        return cls.get_format(uri)()
+
+    @abstractmethod
+    def output_typosquatting(self, entries):
+        ...
+
+
 @dataclass()
 class DiffOutputBase(OutputBase, metaclass=ABCMeta):
     detections: bool = True
diff --git a/aura/output/json.py b/aura/output/json.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import Any
 
-from .base import ScanOutputBase, DiffOutputBase
+from .base import ScanOutputBase, DiffOutputBase, TyposquattingOutputBase
 from ..type_definitions import DiffType, DiffAnalyzerType
 from ..json_proxy import dumps
 
@@ -97,3 +97,19 @@ def output_diff(self, diff_analyzer: DiffAnalyzerType):
             payload["diffs"].append(diff)
 
         print(dumps(payload), file=self._fd)
+
+
+class JSONTyposquattingOutput(TyposquattingOutputBase):
+    @classmethod
+    def protocol(cls) -> str:
+        return "json"
+
+    def output_typosquatting(self, entries):
+        for x in entries:
+            data = {
+                "original": x["original"],
+                "typosquatting": x["typo"],
+                "original_score": x["orig_score"].get_score_matrix(),
+                "typosquatting_score": x["typo_score"].get_score_matrix(),
+            }
+            print(dumps(data))
diff --git a/aura/output/text.py b/aura/output/text.py
@@ -6,15 +6,15 @@
 from dataclasses import dataclass
 from textwrap import wrap
 from prettyprinter import pformat
-from typing import Optional, Any
+from typing import Optional, Any, Generator
 from collections import Counter
 
 from click import secho, style
 
 from .. import utils
 from .. import config
 from ..analyzers.detections import get_severity
-from .base import ScanOutputBase, DiffOutputBase, InfoOutputBase
+from .base import ScanOutputBase, DiffOutputBase, InfoOutputBase, TyposquattingOutputBase
 from .table import Table
 
 
@@ -429,6 +429,20 @@ def output_info_data(self, data):
         out.print_bottom_separator()
 
 
+class TextTyposquattingOutput(TyposquattingOutputBase):
+    @classmethod
+    def protocol(cls) -> str:
+        return "text"
+
+    def output_typosquatting(self, entries):
+        out = PrettyReport()
+        for x in entries:
+            diff_table = x["orig_pkg"]._cmp_info(x["typo_pkg"])
+            orig_table = x["orig_score"].get_score_table()
+            typo_table = x["typo_score"].get_score_table()
+            out.print_tables(orig_table, typo_table, diff_table)
+
+
 @dataclass()
 class TextDiffOutput(TextBase, DiffOutputBase):
     _fd: Any = None
diff --git a/aura/typos.py b/aura/typos.py
@@ -201,10 +201,9 @@ def enumerator(
     pkg_cache = {}
     pkg_score_cache = {}
 
-    for num, (orig, typo) in enumerate(generator):
+    for (orig, typo) in generator:
         res = method(orig, typo)
         if res and res < 2:
-
             if orig not in pkg_cache:
                 orig_pkg = package.PypiPackage.from_pypi(orig)
                 pkg_cache[orig] = orig_pkg
diff --git a/docs/source/cookbook/misc/global_pypi_scan.rst b/docs/source/cookbook/misc/global_pypi_scan.rst
@@ -17,6 +17,7 @@ The PyPI scan itself is done on a separate high-performance worker node. While i
 
 The worker node has a fast SSD disk dedicated to caching the packages for the scan that are prefetched from the offline PyPI mirror right before the scan starts. After the prefetch is completed a full scan of all packages is conducted by running parallel Aura scans. All scripts used on the worker node are available under the ``files/`` directory at the root of the Aura repository.
 
+The full list of published PyPI datasets is available at: https://cdn.sourcecode.ai/pypi_datasets/index/datasets.html
 
 Technical specification of the worker node:
 
@@ -25,5 +26,37 @@ CPU   AMD Ryzen 9 3900X 12-Core Processor
 RAM   HyperX 32GB Kit DDR4 3200MHz CL16 XMP
 GPU   SAPPHIRE NITRO+ Radeon RX 580 OC 8G
 Disk  2x Intel 660p M.2 2TB SSD NVMe
-OS    Windows 10 with Aura running inside WSL2 Ubuntu 18.04
+OS    Arch Linux (fully updated prior to scan)
 ===== =====
+
+
+Description of the dataset
+--------------------------
+
+Data produced from global scans are distributed via magnet (torrent) links with metadata hosted on the SourceCode.AI CDN. The dataset content is as follows:
+
+- **dataset.zst** - Single file dataset compressed using `ZSTD <https://facebook.github.io/zstd/>`_. Each line contains a compact JSON document for one scanned PyPI package
+- **joblog.txt** - Joblog file from GNU Parallel
+- **input_packages.txt** - List of PyPI packages passed as input for the global PyPI scan
+- **package_list.txt** - List of PyPI packages actually processed by Aura during the scan; each package listed in this file has an entry in the dataset.zst file
+- **checksums.md5.txt** - List of MD5 checksums for all files contained within the dataset
+- **README.txt** - License & copy of this description
+
+You may have noticed that there is a difference between the files ``input_packages.txt`` and ``package_list.txt``. The input file is generally larger and lists all packages contained in our offline PyPI mirror at the start of a global scan. However, some packages may not have any releases published, so they are skipped by Aura during the actual scan. Other reasons include a corrupted package archive, the scan timeout being reached, or Aura crashing during the scan of a package. This is why the input package list is always larger than the actual list produced by Aura during/after the scan.
+
+To quickly process or glance at the data, we highly recommend using the `jq data processor <https://stedolan.github.io/jq/>`_.
+
+The dataset is released under the `CC BY-NC 4.0 license <https://creativecommons.org/licenses/by-nc/4.0/>`_.
+Use the following citation to give attribution to the original research paper:
+
+::
+
+    @misc{Carnogursky2019thesis,
+    AUTHOR = "CARNOGURSKY, Martin",
+    TITLE = "Attacks on package managers [online]",
+    YEAR = "2019 [cit. 2020-11-02]",
+    TYPE = "Bachelor Thesis",
+    SCHOOL = "Masaryk University, Faculty of Informatics, Brno",
+    SUPERVISOR = "Vit Bukac",
+    URL = "Available at WWW <https://is.muni.cz/th/y41ft/>",
+    }
diff --git a/pyproject.toml b/pyproject.toml
@@ -109,6 +109,10 @@ sqlite = "aura.output.sqlite:SQLiteDiffOutput"
 [tool.poetry.plugins."aura.info_output_handlers"]
 text = "aura.output.text:TextInfoOutput"
 
+[tool.poetry.plugins."aura.typosquatting_output_handlers"]
+text = "aura.output.text:TextTyposquattingOutput"
+json = "aura.output.json:JSONTyposquattingOutput"
+
 [tool.poetry.plugins."aura.diff_hooks"]
 diff_archive = "aura.analyzers.archive:diff_archive"
 
diff --git a/tests/test_typos.py b/tests/test_typos.py
@@ -21,7 +21,7 @@ def test_distance():
 
 
 @patch("aura.typos.get_all_pypi_packages")
-def disabled_test_typosquatting_generator(mock, tmp_path, mock_pypi_stats):  # FIXME
+def disable_test_typosquatting_generator(mock, tmp_path, mock_pypi_stats):  # FIXME
     stats: Path = tmp_path / "pypi_stats.json"
     stats.write_text("\n".join(json.dumps(x) for x in config.iter_pypi_stats()))
     os.environ["AURA_PYPI_STATS"] = str(stats)
@@ -35,10 +35,10 @@ def disabled_test_typosquatting_generator(mock, tmp_path, mock_pypi_stats):  # F
             'grequest',
         ]
 
-        runner = CliRunner()
+        runner = CliRunner(mix_stderr=False)
         result = runner.invoke(
             cli.cli,
-            ['find-typosquatting', '--max-distance', '1', '--limit', '10' ]
+            ['find-typosquatting', '--limit', '10', '-f', 'json'],
         )
         if result.exception:
             raise result.exception