Skip to content

Commit ebf7220

Browse files
authored
Merge pull request #1 from dacort/feature/parquet
Initial Parquet support
2 parents a5eade0 + 4b9938a commit ebf7220

7 files changed

Lines changed: 344 additions & 24 deletions

File tree

.github/workflows/ci.yml

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
# CI for faker-cli: run the pytest suite across OSes and Python versions
# using Poetry, with cached installs of both Poetry itself and the venv.
name: Faker CLI Test Suite
on: [push]
jobs:
  pytest:
    strategy:
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        python-version: ["3.9", "3.10", "3.11"]
    runs-on: ${{ matrix.os }}
    defaults:
      run:
        shell: bash
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        id: setup-python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      # Cache the Poetry installation itself (keyed on OS/Python/Poetry version).
      - name: cache poetry install
        uses: actions/cache@v3
        with:
          path: ~/.local
          key: ${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-poetry-1.3.2-0
      - name: Install Poetry
        uses: snok/install-poetry@v1
        with:
          virtualenvs-create: true
          virtualenvs-in-project: true
          installer-parallel: true
      # Cache the project virtualenv, invalidated whenever poetry.lock changes.
      - name: Load cached venv
        id: cached-poetry-dependencies
        uses: actions/cache@v3
        with:
          path: .venv
          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
      - name: Install dependencies
        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
        run: |
          poetry install --no-interaction --no-root
      - name: Install project
        run: |
          poetry install --no-interaction
      - name: Run tests
        run: |
          source $VENV
          poetry run pytest -v

README.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
Faker is an awesome Python library, but I often just want a simple command I can run to generate data in a variety of formats.
44

5-
With Faker CLI, you can easily generate CSV or JSON data with fields of your choosing.
5+
With Faker CLI, you can easily generate CSV, JSON, or Parquet data with fields of your choosing.
66

77
You can also utilize pre-built templates for common data formats!
88

@@ -70,6 +70,20 @@ fake -n 10 pyint,user_name,date_this_year -f json -c id,awesome_name,last_attent
7070
{"id": 1967, "awesome_name": "jmendoza", "last_attention_at": "2023-01-23"}
7171
```
7272

73+
### Parquet
74+
75+
OK, it had to happen, you can even write Parquet.
76+
77+
```bash
78+
fake -n 10 pyint,user_name,date_this_year -f parquet -o sample.parquet
79+
```
80+
81+
_youcanevenwritestraighttos3_ 🤭
82+
83+
```bash
84+
fake -n 10 pyint,user_name,date_this_year -f parquet -o s3://YOUR_BUCKET/data/sample.parquet
85+
```
86+
7387
## Templates
7488

7589
Want to generate 1 MILLION S3 Access logs in ~2 minutes? Now you can.

faker_cli/cli.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import sys
44
from faker_cli.templates import CloudFrontWriter, S3AccessLogs, S3AccessWriter, CloudTrailLogs, CloudFrontLogs
55

6-
from faker_cli.writer import CSVWriter, JSONWriter
6+
from faker_cli.writer import CSVWriter, JSONWriter, ParquetWriter
77
from typing import List
88

99
def infer_column_names(col_names, col_types: str) -> List[str]:
@@ -18,7 +18,8 @@ def infer_column_names(col_names, col_types: str) -> List[str]:
1818

1919
KLAS_MAPPER = {
2020
"csv": CSVWriter,
21-
"json": JSONWriter
21+
"json": JSONWriter,
22+
"parquet": ParquetWriter,
2223
}
2324

2425
TEMPLATE_MAPPER = {
@@ -32,11 +33,12 @@ def infer_column_names(col_names, col_types: str) -> List[str]:
3233

3334
@click.command()
3435
@click.option("--num-rows", "-n", default=1, help="Number of rows")
35-
@click.option("--format", "-f", type=click.Choice(["csv", "json"]), default="csv", help="Format of the output")
36+
@click.option("--format", "-f", type=click.Choice(["csv", "json", "parquet"]), default="csv", help="Format of the output")
37+
@click.option("--output", "-o", type=click.Path(writable=True))
3638
@click.option("--columns", "-c", help="Column names", default=None, required=False)
3739
@click.option("--template", "-t", help="Template to use", type=click.Choice(["s3access", "cloudfront"]), default=None)
3840
@click.argument("column_types", required=False)
39-
def main(num_rows, format, columns, template, column_types):
41+
def main(num_rows, format, output, columns, template, column_types):
4042
"""
4143
Generate fake data, easily.
4244
@@ -45,13 +47,23 @@ def main(num_rows, format, columns, template, column_types):
4547
4648
You can also use --template for real-world synthetic data.
4749
"""
50+
# Do some initial validation - we must have either template or column types
4851
if not template and not column_types:
4952
ctx = click.get_current_context()
5053
click.echo(ctx.get_help())
5154
ctx.exit()
5255
raise click.BadArgumentUsage(
5356
"either --template or a list of Faker property names must be provided."
5457
)
58+
59+
# Parquet output requires a filename
60+
if format == "parquet" and output is None:
61+
raise click.BadArgumentUsage("parquet format requires --output/-o filename parameter.")
62+
if output is not None and format != "parquet":
63+
raise click.BadArgumentUsage("output files not supported for csv/json yet.")
64+
65+
# If the user provides a template, we use that provider and writer and exit.
66+
# We assume a template has a custom writer that may be different than CSV or JSON
5567
if template:
5668
writer = TEMPLATE_MAPPER[template][0](sys.stdout, None)
5769
log_entry = TEMPLATE_MAPPER[template][1]
@@ -60,20 +72,12 @@ def main(num_rows, format, columns, template, column_types):
6072
writer.write(row)
6173
return
6274

75+
# Now, if a template hasn't been provided, generate some fake data!
6376
col_types = column_types.split(",")
6477
headers = infer_column_names(columns, column_types)
65-
writer = KLAS_MAPPER.get(format)(sys.stdout, headers)
78+
writer = KLAS_MAPPER.get(format)(sys.stdout, headers, output)
6679
for i in range(num_rows):
6780
# TODO: Handle args
6881
row = [ fake.format(ctype) for ctype in col_types ]
6982
writer.write(row)
70-
# Convert columns to templates
71-
# if format == "csv":
72-
# column_types = [f"{{{{{x}}}}}" for x in column_types.split(',')]
73-
# print(fake.csv(data_columns=(column_types), num_rows=num_rows))
74-
# elif format == "json":
75-
# # convert column_types into a dict
76-
# cols = column_types.split(",")
77-
# column_def = dict(zip(cols, cols))
78-
# print(fake.json(data_columns=column_def, num_rows=num_rows))
79-
83+
writer.close()

faker_cli/writer.py

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,58 @@
11
import csv
22
import json
3+
from typing import Optional
4+
import pyarrow as pa
5+
import pyarrow.parquet as pq
6+
37

48
class Writer:
    """Base class for fake-data output writers (CSV, JSON, Parquet).

    Subclasses set `self.writer` and override `write()`; writers that
    buffer output (e.g. Parquet) also override `close()`.
    """

    def __init__(self, output, headers, filename: Optional[str] = None):
        self.output = output    # destination stream, e.g. sys.stdout
        self.headers = headers  # column names for generated rows
        # Store the optional target filename instead of dropping it, so
        # file-based subclasses inherit it without re-declaring it.
        self.filename = filename
        self.writer = None      # concrete writer object, set by subclasses

    def write(self, row):
        """Write a single row of values; implemented by subclasses."""
        pass

    def close(self):
        """Finalize the output; a no-op unless a subclass buffers data."""
        pass
1621
class CSVWriter(Writer):
    """Streams rows as CSV to the output stream, emitting the header line first."""

    def __init__(self, output, headers, filename):
        # `filename` is accepted only for signature parity with other writers.
        super().__init__(output, headers)
        self.writer = csv.writer(output)
        # Header row goes out immediately, before any data rows.
        self.writer.writerow(self.headers)

    def write(self, row):
        self.writer.writerow(row)
30+
2531
class JSONWriter(Writer):
    """Streams rows as JSON Lines: one JSON object per row, newline-terminated."""

    def __init__(self, output, headers, filename):
        # `filename` is unused; JSON always streams straight to `output`.
        super().__init__(output, headers)
        self.writer = self.output

    def write(self, row):
        record = dict(zip(self.headers, row))
        # default=str covers non-JSON-native values such as dates.
        self.writer.write(json.dumps(record, default=str) + "\n")
42+
class ParquetWriter(Writer):
    """Buffers rows in memory and writes a single Parquet file on close().

    Previously each write() built a one-row pyarrow Table and concatenated
    it onto the accumulated Table, which is quadratic in the number of rows
    and crashed in close() when no rows had been written. We now append to
    plain per-column lists and build the Table exactly once.
    """

    def __init__(self, output, headers, filename):
        super().__init__(output, headers)
        self.filename = filename
        # One value list per column; converted to a pyarrow Table in close().
        self.columns = {header: [] for header in headers}

    def write(self, row):
        # O(1) per row: append each cell to its column buffer.
        for header, value in zip(self.headers, row):
            self.columns[header].append(value)

    def close(self):
        # Builds the Table once; with zero rows this still writes a valid
        # (empty) file with the expected columns. `filename` may be a local
        # path or an s3:// URI supported by pyarrow's filesystem layer.
        pq.write_table(pa.table(self.columns), self.filename)

0 commit comments

Comments
 (0)