Skip to content

Commit 7b4c4ed

Browse files
committed
feat(stats): include code blocks in word counting
1 parent 8e0a368 commit 7b4c4ed

7 files changed

Lines changed: 47 additions & 14 deletions

File tree

api/ingestors/note_ingestor.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@
4343
IMG_SRC_PATTERN = re.compile(r'<img\s+[^>]*src="([^"]+)"', re.IGNORECASE)
4444
HTML_IMG_TAG_PATTERN = re.compile(r"<img\s+[^>]*>", re.IGNORECASE)
4545
MARKDOWN_IMG_PATTERN = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
46-
CODE_BLOCK_PATTERN = re.compile(r"```[\s\S]*?```")
47-
INLINE_CODE_PATTERN = re.compile(r"`[^`\n]*`")
46+
CODE_BLOCK_PATTERN = re.compile(r"```[^\n]*\n([\s\S]*?)```")
47+
INLINE_CODE_PATTERN = re.compile(r"`([^`\n]*)`")
4848
LINK_PATTERN = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
4949
HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
5050
MATH_BLOCK_PATTERN = re.compile(r"\$\$[\s\S]*?\$\$")
@@ -521,16 +521,16 @@ def _to_plain_text(self, markdown_body: str) -> str:
521521
if not base:
522522
return ""
523523

524-
text = CODE_BLOCK_PATTERN.sub(" ", base)
524+
text = CODE_BLOCK_PATTERN.sub(lambda match: f" {match.group(1)} ", base)
525525
text = MATH_BLOCK_PATTERN.sub(" ", text)
526526
text = INLINE_MATH_PATTERN.sub(" ", text)
527-
text = INLINE_CODE_PATTERN.sub(" ", text)
527+
text = INLINE_CODE_PATTERN.sub(lambda match: f" {match.group(1)} ", text)
528528
text = MARKDOWN_IMG_PATTERN.sub(" ", text)
529529
text = HTML_IMG_TAG_PATTERN.sub(" ", text)
530530
text = LINK_PATTERN.sub(lambda match: f" {match.group(1)} ", text)
531531
text = HTML_TAG_PATTERN.sub(" ", text)
532532
text = re.sub(r"(?mi)^\s*#{1,6}\s*images\s*$", " ", text)
533-
text = re.sub(r"[#>*_`~\-]+", " ", text)
533+
text = re.sub(r"[#>*~]+", " ", text)
534534
text = re.sub(r"\s+", " ", text)
535535
return text.strip()
536536

docs/project/entries/git-submodule-quick-notes.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ type: guide
1212
status: published
1313
related:
1414
- git
15-
word_count: 40
15+
word_count: 51
1616
image_count: 0
1717
---
1818

docs/project/entries/github-pages-openknowforge-web.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ type: guide
1414
status: published
1515
related:
1616
- openknowforge
17-
word_count: 355
17+
word_count: 428
1818
image_count: 0
1919
---
2020

docs/project/entries/kl-kl-divergence.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ date: '2026-03-26'
1111
type: concept
1212
status: published
1313
related: []
14-
word_count: 132
14+
word_count: 131
1515
image_count: 0
1616
---
1717

docs/project/entries/openknowforge.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ status: published
1515
related:
1616
- notes-explorer
1717
- search-index
18-
word_count: 153
18+
word_count: 249
1919
image_count: 0
2020
---
2121

docs/public/search-index.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"generatedAt": "2026-03-26T15:47:33+00:00",
2+
"generatedAt": "2026-03-26T16:01:55+00:00",
33
"notes": [
44
{
55
"slug": "note",
@@ -24,7 +24,7 @@
2424
"updated_at": "2026-03-26T13:02:27+00:00",
2525
"submitted_at": "2026-03-26T13:02:27+00:00",
2626
"date": "2026-03-26",
27-
"word_count": 132,
27+
"word_count": 131,
2828
"image_count": 0,
2929
"tags": [
3030
"math",
@@ -41,7 +41,7 @@
4141
"updated_at": "2026-03-26T12:29:03+00:00",
4242
"submitted_at": "2026-03-26T12:29:03+00:00",
4343
"date": "2026-03-26",
44-
"word_count": 355,
44+
"word_count": 428,
4545
"image_count": 0,
4646
"tags": [
4747
"github-pages",
@@ -60,7 +60,7 @@
6060
"updated_at": "2026-03-26T11:53:34+00:00",
6161
"submitted_at": "2026-03-26T11:53:34+00:00",
6262
"date": "2026-03-26",
63-
"word_count": 40,
63+
"word_count": 51,
6464
"image_count": 0,
6565
"tags": [
6666
"git",
@@ -77,7 +77,7 @@
7777
"updated_at": "2026-03-26T00:00:00+00:00",
7878
"submitted_at": "2026-03-26T00:00:00+00:00",
7979
"date": "2026-03-26",
80-
"word_count": 153,
80+
"word_count": 249,
8181
"image_count": 0,
8282
"tags": [
8383
"guide",

tests/test_api_note.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,39 @@ def test_post_note_empty_title_fails_validation(client: TestClient) -> None:
382382
assert response.status_code == 422
383383

384384

385+
def test_word_count_includes_code_blocks_and_inline_code(client: TestClient) -> None:
386+
create_resp = client.post(
387+
'/note',
388+
json={
389+
'title': 'Code Count',
390+
'content': '\n'.join(
391+
[
392+
'before',
393+
'```bash',
394+
'git status',
395+
'```',
396+
'`inline_code`',
397+
]
398+
),
399+
'tags': ['code'],
400+
'images': [],
401+
'type': 'note',
402+
'status': 'published',
403+
'related': [],
404+
'submitted_at': '2026-03-26T16:00:00+00:00',
405+
},
406+
)
407+
assert create_resp.status_code == 200
408+
slug = create_resp.json()['result']['slug']
409+
assert create_resp.json()['result']['word_count'] == 4
410+
411+
note_resp = client.get(f'/note/{slug}')
412+
assert note_resp.status_code == 200
413+
note = note_resp.json()['result']
414+
assert note['word_count'] == 4
415+
assert note['image_count'] == 0
416+
417+
385418
def test_existing_note_stats_are_backfilled_once(client: TestClient, tmp_path: Path) -> None:
386419
legacy_note_path = tmp_path / 'docs' / 'project' / 'entries' / 'legacy.md'
387420
legacy_note_path.parent.mkdir(parents=True, exist_ok=True)

0 commit comments

Comments
 (0)