Skip to content

Commit c6b79d8

Browse files
committed
- add license information to README.rst
- publish pypi dataset and add it to docs
1 parent b86df77 commit c6b79d8

File tree

10 files changed

+118
-31
lines changed

10 files changed

+118
-31
lines changed

README.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,3 +83,22 @@ Authors & Contributors
8383

8484
* **Martin Carnogursky** - *Initial work and project lead* - https://is.muni.cz/person/410345
8585
* **Mirza Zulfan** - *Logo Design* - https://github.com/mirzazulfan
86+
87+
88+
LICENSE
89+
=======
90+
Aura framework is licensed under the **GPL-3.0**.
91+
Datasets produced from global scans using Aura are released under the **CC BY-NC 4.0** license.
92+
Use the following citation when using Aura or data produced by Aura in research:
93+
94+
::
95+
96+
@misc{Carnogursky2019thesis,
97+
AUTHOR = "CARNOGURSKY, Martin",
98+
TITLE = "Attacks on package managers [online]",
99+
YEAR = "2019 [cit. 2020-11-02]",
100+
TYPE = "Bachelor Thesis",
101+
SCHOOL = "Masaryk University, Faculty of Informatics, Brno",
102+
SUPERVISOR = "Vit Bukac",
103+
URL = "Available at WWW <https://is.muni.cz/th/y41ft/>",
104+
}

aura/cli.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -220,16 +220,16 @@ def update_aura():
220220

221221

222222
@cli.command()
223-
@click.option("-o", "--out", default="-", type=click.File("w"))
224223
@click.option("-m", "--max-distance", default=2, type=click.IntRange(min=0, max=10))
225224
@click.option("-l", "--limit", default=100, type=click.INT)
225+
@click.option("-f", "--format", default="text")
226226
@click.argument("pkg", nargs=-1)
227-
def find_typosquatting(out, max_distance, limit=100, pkg=None):
227+
def find_typosquatting(max_distance, limit=100, pkg=None, format="text"):
228228
if limit <= 0:
229229
click.secho("Invalid value for limit", file=sys.stderr)
230230
sys.exit(1)
231231

232-
commands.generate_typosquatting(out=out, distance=max_distance, limit=limit, pkgs=pkg)
232+
commands.generate_typosquatting(distance=max_distance, limit=limit, pkgs=pkg, format_uri=format)
233233

234234

235235
@cli.command()

aura/commands.py

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from concurrent import futures
99
from pathlib import Path
1010
from functools import partial
11+
from itertools import islice
1112
from typing import Union, Optional, Tuple, Generator, List
1213

1314
import click
@@ -21,9 +22,8 @@
2122
from . import utils
2223
from . import mirror
2324
from . import typos
24-
from .package import PypiPackage
2525
from .analyzers.detections import Detection
26-
from .output.base import ScanOutputBase, DiffOutputBase, InfoOutputBase
26+
from .output.base import ScanOutputBase, DiffOutputBase, InfoOutputBase, TyposquattingOutputBase
2727

2828

2929
logger = config.get_logger(__name__)
@@ -214,28 +214,15 @@ def show_info():
214214
formatter.output_info_data(info_data)
215215

216216

217-
def generate_typosquatting(out, distance=2, limit=None, pkgs=None):
218-
from .output.text import PrettyReport
219-
220-
p = PrettyReport() # TODO: convert into plugin system
221-
217+
def generate_typosquatting(distance=2, limit=None, pkgs=None, format_uri="text"):
222218
if not pkgs:
223219
pkgs = typos.get_popular_packages()
224220

225221
f = partial(typos.damerau_levenshtein, max_distance=distance)
226222
combinations = typos.generate_combinations(left=pkgs)
227223

228-
for idx, data in enumerate(typos.enumerator(combinations, f)):
229-
if limit and idx >= limit:
230-
break
231-
232-
try:
233-
diff_table = data["orig_pkg"]._cmp_info(data["typo_pkg"])
234-
t1 = data["orig_score"].get_score_table()
235-
t2 = data["typo_score"].get_score_table()
236-
p.print_tables(t1, t2, diff_table)
237-
except exceptions.NoSuchPackage:
238-
continue
224+
formatter = TyposquattingOutputBase.from_uri(format_uri)
225+
formatter.output_typosquatting(islice(typos.enumerator(combinations, f), 0, limit))
239226

240227

241228
def prefetch(*uris):

aura/output/base.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def get_format(cls, uri: str, parsed=None) -> OutputBase:
5555
elif fmt.is_supported(parsed_uri=parsed):
5656
return fmt
5757

58-
raise exceptions.InvalidOutput("No such output format")
58+
raise exceptions.InvalidOutput(f"No such output format `{uri}`")
5959

6060

6161
@dataclass()
@@ -165,6 +165,21 @@ def filtered(self, hits):
165165
return processed
166166

167167

168+
@dataclass()
169+
class TyposquattingOutputBase(OutputBase, metaclass=ABCMeta):
170+
@classmethod
171+
def entrypoint(cls) -> str:
172+
return "aura.typosquatting_output_handlers"
173+
174+
@classmethod
175+
def from_uri(cls, uri: str) -> TyposquattingOutputBase:
176+
return cls.get_format(uri)()
177+
178+
@abstractmethod
179+
def output_typosquatting(self, entries):
180+
...
181+
182+
168183
@dataclass()
169184
class DiffOutputBase(OutputBase, metaclass=ABCMeta):
170185
detections: bool = True

aura/output/json.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from dataclasses import dataclass
22
from typing import Any
33

4-
from .base import ScanOutputBase, DiffOutputBase
4+
from .base import ScanOutputBase, DiffOutputBase, TyposquattingOutputBase
55
from ..type_definitions import DiffType, DiffAnalyzerType
66
from ..json_proxy import dumps
77

@@ -97,3 +97,19 @@ def output_diff(self, diff_analyzer: DiffAnalyzerType):
9797
payload["diffs"].append(diff)
9898

9999
print(dumps(payload), file=self._fd)
100+
101+
102+
class JSONTyposquattingOutput(TyposquattingOutputBase):
103+
@classmethod
104+
def protocol(cls) -> str:
105+
return "json"
106+
107+
def output_typosquatting(self, entries):
108+
for x in entries:
109+
data = {
110+
"original": x["original"],
111+
"typosquatting": x["typo"],
112+
"original_score": x["orig_score"].get_score_matrix(),
113+
"typosquatting_score": x["typo_score"].get_score_matrix(),
114+
}
115+
print(dumps(data))

aura/output/text.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@
66
from dataclasses import dataclass
77
from textwrap import wrap
88
from prettyprinter import pformat
9-
from typing import Optional, Any
9+
from typing import Optional, Any, Generator
1010
from collections import Counter
1111

1212
from click import secho, style
1313

1414
from .. import utils
1515
from .. import config
1616
from ..analyzers.detections import get_severity
17-
from .base import ScanOutputBase, DiffOutputBase, InfoOutputBase
17+
from .base import ScanOutputBase, DiffOutputBase, InfoOutputBase, TyposquattingOutputBase
1818
from .table import Table
1919

2020

@@ -429,6 +429,20 @@ def output_info_data(self, data):
429429
out.print_bottom_separator()
430430

431431

432+
class TextTyposquattingOutput(TyposquattingOutputBase):
433+
@classmethod
434+
def protocol(cls) -> str:
435+
return "text"
436+
437+
def output_typosquatting(self, entries):
438+
out = PrettyReport()
439+
for x in entries:
440+
diff_table = x["orig_pkg"]._cmp_info(x["typo_pkg"])
441+
orig_table = x["orig_score"].get_score_table()
442+
typo_table = x["typo_score"].get_score_table()
443+
out.print_tables(orig_table, typo_table, diff_table)
444+
445+
432446
@dataclass()
433447
class TextDiffOutput(TextBase, DiffOutputBase):
434448
_fd: Any = None

aura/typos.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,10 +201,9 @@ def enumerator(
201201
pkg_cache = {}
202202
pkg_score_cache = {}
203203

204-
for num, (orig, typo) in enumerate(generator):
204+
for (orig, typo) in generator:
205205
res = method(orig, typo)
206206
if res and res < 2:
207-
208207
if orig not in pkg_cache:
209208
orig_pkg = package.PypiPackage.from_pypi(orig)
210209
pkg_cache[orig] = orig_pkg

docs/source/cookbook/misc/global_pypi_scan.rst

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ The PyPI scan itself is done on a separate high-performance worker node. While i
1717

1818
The worker node has a fast SSD disk dedicated to caching the packages for the scan that are prefetched from the offline PyPI mirror right before the scan starts. After the prefetch is completed a full scan of all packages is conducted by running parallel Aura scans. All scripts used on the worker node are available under the ``files/`` directory at the root of the Aura repository.
1919

20+
The full list of published PyPI datasets is available at: https://cdn.sourcecode.ai/pypi_datasets/index/datasets.html
2021

2122
Technical specification of the worker node:
2223

@@ -25,5 +26,37 @@ CPU AMD Ryzen 9 3900X 12-Core Processor
2526
RAM HyperX 32GB Kit DDR4 3200MHz CL16 XMP
2627
GPU SAPPHIRE NITRO+ Radeon RX 580 OC 8G
2728
Disk 2x Intel 660p M.2 2TB SSD NVMe
28-
OS Windows 10 with Aura running inside WSL2 Ubuntu 18.04
29+
OS Arch Linux (fully updated prior to scan)
2930
===== =====
31+
32+
33+
Description of the dataset
34+
--------------------------
35+
36+
Data produced from global scans are distributed via magnet (torrent) links with metadata hosted on SourceCode.AI CDN. The dataset content is as follows:
37+
38+
- **dataset.zst** - Single-file dataset compressed using `ZSTD <https://facebook.github.io/zstd/>`_. Each line contains one compact JSON document for a single scanned PyPI package
39+
- **joblog.txt** - Joblog file from GNU Parallel
40+
- **input_packages.txt** - List of PyPI packages passed as input for the global PyPI scan
41+
- **package_list.txt** - List of PyPI packages actually processed by Aura during the scan; each package listed in this file has an entry in the dataset.zst file
42+
- **checksums.md5.txt** - List of MD5 checksums for all files contained within the dataset
43+
- **README.txt** - License & copy of this description
44+
45+
You may have noticed that there is a difference between the files ``input_packages.txt`` and ``package_list.txt``. The input file is generally larger and lists all packages contained in our offline PyPI mirror at the start of a global scan. However, some packages may not have any releases published, so they are skipped by Aura during the actual scan. Other reasons include a corrupted package archive, the scan timeout being reached, or Aura crashing during the scan of a package. This is why the input package list is always larger than the list actually produced by Aura during/after the scan.
46+
47+
To quickly process or glance at the data, we highly recommend using the `jq data processor <https://stedolan.github.io/jq/>`_.
48+
49+
The dataset is released under the `CC BY-NC 4.0 license <https://creativecommons.org/licenses/by-nc/4.0/>`_.
50+
Use the following citation to give attribution to the original research paper:
51+
52+
::
53+
54+
@misc{Carnogursky2019thesis,
55+
AUTHOR = "CARNOGURSKY, Martin",
56+
TITLE = "Attacks on package managers [online]",
57+
YEAR = "2019 [cit. 2020-11-02]",
58+
TYPE = "Bachelor Thesis",
59+
SCHOOL = "Masaryk University, Faculty of Informatics, Brno",
60+
SUPERVISOR = "Vit Bukac",
61+
URL = "Available at WWW <https://is.muni.cz/th/y41ft/>",
62+
}

pyproject.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ sqlite = "aura.output.sqlite:SQLiteDiffOutput"
109109
[tool.poetry.plugins."aura.info_output_handlers"]
110110
text = "aura.output.text:TextInfoOutput"
111111

112+
[tool.poetry.plugins."aura.typosquatting_output_handlers"]
113+
text = "aura.output.text:TextTyposquattingOutput"
114+
json = "aura.output.json:JSONTyposquattingOutput"
115+
112116
[tool.poetry.plugins."aura.diff_hooks"]
113117
diff_archive = "aura.analyzers.archive:diff_archive"
114118

tests/test_typos.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ def test_distance():
2121

2222

2323
@patch("aura.typos.get_all_pypi_packages")
24-
def disabled_test_typosquatting_generator(mock, tmp_path, mock_pypi_stats): # FIXME
24+
def disable_test_typosquatting_generator(mock, tmp_path, mock_pypi_stats): # FIXME
2525
stats: Path = tmp_path / "pypi_stats.json"
2626
stats.write_text("\n".join(json.dumps(x) for x in config.iter_pypi_stats()))
2727
os.environ["AURA_PYPI_STATS"] = str(stats)
@@ -35,10 +35,10 @@ def disabled_test_typosquatting_generator(mock, tmp_path, mock_pypi_stats): # F
3535
'grequest',
3636
]
3737

38-
runner = CliRunner()
38+
runner = CliRunner(mix_stderr=False)
3939
result = runner.invoke(
4040
cli.cli,
41-
['find-typosquatting', '--max-distance', '1', '--limit', '10' ]
41+
['find-typosquatting', '--limit', '10', '-f', 'json'],
4242
)
4343
if result.exception:
4444
raise result.exception

0 commit comments

Comments
 (0)