Skip to content

Commit 30240b1

Browse files
chore: add output samples and ignore partial download archives
1 parent 917cb64 commit 30240b1

54 files changed

Lines changed: 4795 additions & 240 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,3 +24,6 @@ __pycache__/
2424
*.pyc
2525

2626
tmp/
27+
28+
# Partial download artifacts from receipt dataset acquisition
29+
models/data/downloaded_*/**/archives/*.part-*

.prettierignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ coverage
44
playwright-report
55
*.lock
66
CHANGELOG.md
7+
models

models/data/downloaded_images/ara/mahmoud2019-receiptqa/archives/test_images.zip.part-8492-1772824074805-jl3fjl9n

Whitespace-only changes.

models/data/downloaded_images/receipt-license-manifest.json

Lines changed: 14 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,7 @@
1818
"title": "mahmoud2019/ReceiptQA",
1919
"sourceType": "huggingface",
2020
"language": "ara",
21-
"languages": [
22-
"ara"
23-
],
21+
"languages": ["ara"],
2422
"license": "MIT",
2523
"evidencePath": "licenses/ara/mahmoud2019-receiptqa.json",
2624
"totalAvailable": 6,
@@ -754,9 +752,7 @@
754752
"title": "Lakshmiperumal/scanned_receipts",
755753
"sourceType": "huggingface",
756754
"language": "eng",
757-
"languages": [
758-
"eng"
759-
],
755+
"languages": ["eng"],
760756
"license": "CC-BY-4.0",
761757
"evidencePath": "licenses/eng/lakshmiperumal-scanned-receipts.json",
762758
"totalAvailable": 713,
@@ -1562,19 +1558,15 @@
15621558
"mediaType": "application/octet-stream"
15631559
}
15641560
],
1565-
"skipped": [
1566-
"Limit reached (80 of 713 files)."
1567-
],
1561+
"skipped": ["Limit reached (80 of 713 files)."],
15681562
"notes": "Scanned receipt images from the FiftyOne mirror."
15691563
},
15701564
{
15711565
"sourceId": "Saran-R12/Receipts",
15721566
"title": "Saran-R12/Receipts",
15731567
"sourceType": "huggingface",
15741568
"language": "eng",
1575-
"languages": [
1576-
"eng"
1577-
],
1569+
"languages": ["eng"],
15781570
"license": "Apache-2.0",
15791571
"evidencePath": "licenses/eng/saran-r12-receipts.json",
15801572
"totalAvailable": 1,
@@ -1948,9 +1940,7 @@
19481940
"title": "Voxel51/scanned_receipts",
19491941
"sourceType": "huggingface",
19501942
"language": "eng",
1951-
"languages": [
1952-
"eng"
1953-
],
1943+
"languages": ["eng"],
19541944
"license": "CC-BY-4.0",
19551945
"evidencePath": "licenses/eng/voxel51-scanned-receipts.json",
19561946
"totalAvailable": 713,
@@ -2756,19 +2746,15 @@
27562746
"mediaType": "application/octet-stream"
27572747
}
27582748
],
2759-
"skipped": [
2760-
"Limit reached (80 of 713 files)."
2761-
],
2749+
"skipped": ["Limit reached (80 of 713 files)."],
27622750
"notes": "Primarily English scanned receipt images from the SROIE mirror."
27632751
},
27642752
{
27652753
"sourceId": "https://zenodo.org/records/13688441/files/Dataset.zip?download=1",
27662754
"title": "Zenodo hand-captured restaurant receipts",
27672755
"sourceType": "direct",
27682756
"language": "eng",
2769-
"languages": [
2770-
"eng"
2771-
],
2757+
"languages": ["eng"],
27722758
"license": "CC-BY-4.0",
27732759
"evidencePath": "licenses/eng/zenodo-hand-captured-restaurant-receipts.json",
27742760
"totalAvailable": 1,
@@ -3502,9 +3488,7 @@
35023488
"title": "Voxel51/consolidated_receipt_dataset",
35033489
"sourceType": "huggingface",
35043490
"language": "ind",
3505-
"languages": [
3506-
"ind"
3507-
],
3491+
"languages": ["ind"],
35083492
"license": "CC-BY-4.0",
35093493
"evidencePath": "licenses/ind/voxel51-consolidated-receipt-dataset.json",
35103494
"totalAvailable": 801,
@@ -4310,19 +4294,15 @@
43104294
"mediaType": "application/octet-stream"
43114295
}
43124296
],
4313-
"skipped": [
4314-
"Limit reached (80 of 801 files)."
4315-
],
4297+
"skipped": ["Limit reached (80 of 801 files)."],
43164298
"notes": "Indonesian receipt dataset mirrored through FiftyOne."
43174299
},
43184300
{
43194301
"sourceId": "HumynLabs/Korean_Receipts_Dataset",
43204302
"title": "HumynLabs/Korean_Receipts_Dataset",
43214303
"sourceType": "huggingface",
43224304
"language": "kor",
4323-
"languages": [
4324-
"kor"
4325-
],
4305+
"languages": ["kor"],
43264306
"license": "CC-BY-4.0",
43274307
"evidencePath": "licenses/kor/humynlabs-korean-receipts-dataset.json",
43284308
"totalAvailable": 20,
@@ -4536,9 +4516,7 @@
45364516
"title": "cdek-ocr/receipt-ocr-ru",
45374517
"sourceType": "huggingface",
45384518
"language": "rus",
4539-
"languages": [
4540-
"rus"
4541-
],
4519+
"languages": ["rus"],
45424520
"license": "MIT",
45434521
"evidencePath": "licenses/rus/cdek-ocr-receipt-ocr-ru.json",
45444522
"totalAvailable": 988,
@@ -5344,19 +5322,15 @@
53445322
"mediaType": "application/octet-stream"
53455323
}
53465324
],
5347-
"skipped": [
5348-
"Limit reached (80 of 988 files)."
5349-
],
5325+
"skipped": ["Limit reached (80 of 988 files)."],
53505326
"notes": "Russian receipt OCR image dataset."
53515327
},
53525328
{
53535329
"sourceId": "CC1984/mall_receipt_extraction_dataset",
53545330
"title": "CC1984/mall_receipt_extraction_dataset",
53555331
"sourceType": "huggingface",
53565332
"language": "zho",
5357-
"languages": [
5358-
"zho"
5359-
],
5333+
"languages": ["zho"],
53605334
"license": "MIT",
53615335
"evidencePath": "licenses/zho/cc1984-mall-receipt-extraction-dataset.json",
53625336
"totalAvailable": 994,
@@ -6162,9 +6136,7 @@
61626136
"mediaType": "application/octet-stream"
61636137
}
61646138
],
6165-
"skipped": [
6166-
"Limit reached (80 of 994 files)."
6167-
],
6139+
"skipped": ["Limit reached (80 of 994 files)."],
61686140
"notes": "Chinese mall receipt image dataset."
61696141
}
61706142
]
Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
11
# Receipt Download Report
2+
23
Generated: 2026-03-06T19:25:10.171Z
34

45
Output: models\data\downloaded_images
56
Scope: limited to 80
67

7-
| Language | Source | License | Files | Notes | Evidence |
8-
|---|---|---|---:|---|---|
9-
| ara | mahmoud2019/ReceiptQA | MIT | 80 | Arabic-dominant receipt question-answering dataset with many Egyptian retail receipts and mixed Arabic-English text. | licenses/ara/mahmoud2019-receiptqa.json |
10-
| eng | Lakshmiperumal/scanned_receipts | CC-BY-4.0 | 80 | Scanned receipt images from the FiftyOne mirror. | licenses/eng/lakshmiperumal-scanned-receipts.json |
11-
| eng | Saran-R12/Receipts | Apache-2.0 | 40 | Receipt image dataset distributed as archives on Hugging Face. | licenses/eng/saran-r12-receipts.json |
12-
| eng | Voxel51/scanned_receipts | CC-BY-4.0 | 80 | Primarily English scanned receipt images from the SROIE mirror. | licenses/eng/voxel51-scanned-receipts.json |
13-
| eng | Zenodo hand-captured restaurant receipts | CC-BY-4.0 | 80 | Hand-captured English restaurant receipt dataset distributed from Zenodo. | licenses/eng/zenodo-hand-captured-restaurant-receipts.json |
14-
| ind | Voxel51/consolidated_receipt_dataset | CC-BY-4.0 | 80 | Indonesian receipt dataset mirrored through FiftyOne. | licenses/ind/voxel51-consolidated-receipt-dataset.json |
15-
| kor | HumynLabs/Korean_Receipts_Dataset | CC-BY-4.0 | 20 | Korean receipt image dataset. | licenses/kor/humynlabs-korean-receipts-dataset.json |
16-
| rus | cdek-ocr/receipt-ocr-ru | MIT | 80 | Russian receipt OCR image dataset. | licenses/rus/cdek-ocr-receipt-ocr-ru.json |
17-
| zho | CC1984/mall_receipt_extraction_dataset | MIT | 80 | Chinese mall receipt image dataset. | licenses/zho/cc1984-mall-receipt-extraction-dataset.json |
18-
8+
| Language | Source | License | Files | Notes | Evidence |
9+
| -------- | ---------------------------------------- | ---------- | ----: | -------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
10+
| ara | mahmoud2019/ReceiptQA | MIT | 80 | Arabic-dominant receipt question-answering dataset with many Egyptian retail receipts and mixed Arabic-English text. | licenses/ara/mahmoud2019-receiptqa.json |
11+
| eng | Lakshmiperumal/scanned_receipts | CC-BY-4.0 | 80 | Scanned receipt images from the FiftyOne mirror. | licenses/eng/lakshmiperumal-scanned-receipts.json |
12+
| eng | Saran-R12/Receipts | Apache-2.0 | 40 | Receipt image dataset distributed as archives on Hugging Face. | licenses/eng/saran-r12-receipts.json |
13+
| eng | Voxel51/scanned_receipts | CC-BY-4.0 | 80 | Primarily English scanned receipt images from the SROIE mirror. | licenses/eng/voxel51-scanned-receipts.json |
14+
| eng | Zenodo hand-captured restaurant receipts | CC-BY-4.0 | 80 | Hand-captured English restaurant receipt dataset distributed from Zenodo. | licenses/eng/zenodo-hand-captured-restaurant-receipts.json |
15+
| ind | Voxel51/consolidated_receipt_dataset | CC-BY-4.0 | 80 | Indonesian receipt dataset mirrored through FiftyOne. | licenses/ind/voxel51-consolidated-receipt-dataset.json |
16+
| kor | HumynLabs/Korean_Receipts_Dataset | CC-BY-4.0 | 20 | Korean receipt image dataset. | licenses/kor/humynlabs-korean-receipts-dataset.json |
17+
| rus | cdek-ocr/receipt-ocr-ru | MIT | 80 | Russian receipt OCR image dataset. | licenses/rus/cdek-ocr-receipt-ocr-ru.json |
18+
| zho | CC1984/mall_receipt_extraction_dataset | MIT | 80 | Chinese mall receipt image dataset. | licenses/zho/cc1984-mall-receipt-extraction-dataset.json |

models/data/downloaded_pdfs/receipt-license-manifest.json

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,7 @@
1818
"title": "prithivMLmods/Openpdf-MultiReceipt-1K",
1919
"sourceType": "huggingface",
2020
"language": "deu",
21-
"languages": [
22-
"deu"
23-
],
21+
"languages": ["deu"],
2422
"license": "Apache-2.0",
2523
"evidencePath": "licenses/deu/prithivmlmods-openpdf-multireceipt-1k.json",
2624
"totalAvailable": 996,
@@ -826,9 +824,7 @@
826824
"mediaType": "application/octet-stream"
827825
}
828826
],
829-
"skipped": [
830-
"Limit reached (80 of 996 files)."
831-
],
827+
"skipped": ["Limit reached (80 of 996 files)."],
832828
"notes": "Multi-receipt PDF dataset with a strongly German receipt pool."
833829
}
834830
]
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# Receipt Download Report
2+
23
Generated: 2026-03-06T19:25:10.098Z
34

45
Output: models\data\downloaded_pdfs
56
Scope: limited to 80
67

7-
| Language | Source | License | Files | Notes | Evidence |
8-
|---|---|---|---:|---|---|
9-
| deu | prithivMLmods/Openpdf-MultiReceipt-1K | Apache-2.0 | 80 | Multi-receipt PDF dataset with a strongly German receipt pool. | licenses/deu/prithivmlmods-openpdf-multireceipt-1k.json |
10-
8+
| Language | Source | License | Files | Notes | Evidence |
9+
| -------- | ------------------------------------- | ---------- | ----: | -------------------------------------------------------------- | ------------------------------------------------------- |
10+
| deu | prithivMLmods/Openpdf-MultiReceipt-1K | Apache-2.0 | 80 | Multi-receipt PDF dataset with a strongly German receipt pool. | licenses/deu/prithivmlmods-openpdf-multireceipt-1k.json |

models/data/downloaded_texts/eng/receiptline-examples/files/example/js/en.html

Lines changed: 48 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -14,53 +14,59 @@
1414
limitations under the License.
1515
-->
1616

17-
<!DOCTYPE html>
17+
<!doctype html>
1818
<html>
19-
<head>
20-
<meta charset="utf-8">
19+
<head>
20+
<meta charset="utf-8" />
2121
<title>ReceiptLine JavaScript Example</title>
2222
<script type="text/javascript" src="receiptline.js"></script>
2323
<script type="text/javascript">
24-
function initialize() {
25-
const load = document.getElementById('load');
26-
const paper = document.getElementById('paper');
27-
load.onclick = event => load.value = '';
28-
load.onchange = event => {
29-
const file = event.target.files[0];
30-
if (file) {
31-
const reader = new FileReader();
32-
reader.onload = event => {
33-
// for SVG output
34-
const printer = {
35-
// cpl: characters per line (required)
36-
cpl: 48,
37-
// font: Courier, encoding: utf-8 (optional)
38-
encoding: 'multilingual',
39-
// upsideDown: ignored (optional)
40-
upsideDown: false,
41-
// spacing: line spacing (optional)
42-
spacing: false,
43-
// cutting: ignored (optional)
44-
cutting: true,
45-
// gamma: ignored (optional)
46-
gamma: 1.0,
47-
// command: SVG (optional)
48-
command: 'svg'
49-
};
50-
const svg = receiptline.transform(reader.result, printer);
51-
const dom = new DOMParser().parseFromString(svg, 'image/svg+xml').documentElement;
52-
paper.replaceChildren(dom);
53-
}
54-
reader.readAsText(file);
55-
}
56-
};
24+
function initialize() {
25+
const load = document.getElementById('load')
26+
const paper = document.getElementById('paper')
27+
load.onclick = (event) => (load.value = '')
28+
load.onchange = (event) => {
29+
const file = event.target.files[0]
30+
if (file) {
31+
const reader = new FileReader()
32+
reader.onload = (event) => {
33+
// for SVG output
34+
const printer = {
35+
// cpl: characters per line (required)
36+
cpl: 48,
37+
// font: Courier, encoding: utf-8 (optional)
38+
encoding: 'multilingual',
39+
// upsideDown: ignored (optional)
40+
upsideDown: false,
41+
// spacing: line spacing (optional)
42+
spacing: false,
43+
// cutting: ignored (optional)
44+
cutting: true,
45+
// gamma: ignored (optional)
46+
gamma: 1.0,
47+
// command: SVG (optional)
48+
command: 'svg',
49+
}
50+
const svg = receiptline.transform(reader.result, printer)
51+
const dom = new DOMParser().parseFromString(
52+
svg,
53+
'image/svg+xml'
54+
).documentElement
55+
paper.replaceChildren(dom)
56+
}
57+
reader.readAsText(file)
58+
}
5759
}
60+
}
5861
</script>
59-
</head>
60-
<body onload="initialize()" style="background-color: #eee;">
62+
</head>
63+
<body onload="initialize()" style="background-color: #eee">
6164
<label for="load">Load</label>
62-
<input id="load" type="file" accept=".receipt,text/plain">
63-
<hr>
64-
<div id="paper" style="width: 576px; padding: 12px; background-color: #fff;"></div>
65-
</body>
65+
<input id="load" type="file" accept=".receipt,text/plain" />
66+
<hr />
67+
<div
68+
id="paper"
69+
style="width: 576px; padding: 12px; background-color: #fff"
70+
></div>
71+
</body>
6672
</html>

models/data/downloaded_texts/receipt-license-manifest.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,7 @@
1818
"title": "receiptline/receiptline examples",
1919
"sourceType": "github",
2020
"language": "eng",
21-
"languages": [
22-
"eng"
23-
],
21+
"languages": ["eng"],
2422
"license": "Apache-2.0",
2523
"evidencePath": "licenses/eng/receiptline-examples.json",
2624
"totalAvailable": 22,
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
# Receipt Download Report
2+
23
Generated: 2026-03-06T19:25:10.114Z
34

45
Output: models\data\downloaded_texts
56
Scope: full
67

7-
| Language | Source | License | Files | Notes | Evidence |
8-
|---|---|---|---:|---|---|
9-
| eng | receiptline/receiptline examples | Apache-2.0 | 22 | Apache-2.0 English receipt and order text examples, including .receipt and HTML documents. | licenses/eng/receiptline-examples.json |
10-
8+
| Language | Source | License | Files | Notes | Evidence |
9+
| -------- | -------------------------------- | ---------- | ----: | ------------------------------------------------------------------------------------------ | -------------------------------------- |
10+
| eng | receiptline/receiptline examples | Apache-2.0 | 22 | Apache-2.0 English receipt and order text examples, including .receipt and HTML documents. | licenses/eng/receiptline-examples.json |

0 commit comments

Comments
 (0)