-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtfidf.html
More file actions
1332 lines (1142 loc) · 67.2 KB
/
tfidf.html
File metadata and controls
1332 lines (1142 loc) · 67.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>D-Lab - TF-IDF Tutorial</title>
<style>
:root{
--bg:#ffffff;
--panel:#f8f9fa;
--muted:#6c757d;
--text:#212529;
--blue:#0d6efd;
--indigo:#6610f2;
--emerald:#198754;
--orange:#fd7e14;
--border:#dee2e6;
--red:#dc3545;
--yellow:#ffc107;
--purple:#6f42c1;
}
*{box-sizing:border-box}
body{
margin:0; font-family: ui-sans-serif, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial;
background:var(--bg); color:var(--text);
line-height: 1.6; min-height: 100vh; padding: 20px;
}
.wrap{max-width:1200px;margin:32px auto;padding:0 20px}
.container{
background: white; border-radius: 12px;
box-shadow: 0 4px 12px rgba(0,0,0,0.1); padding: 40px;
}
.course-header{text-align:center;margin-bottom:20px;padding:16px;background:var(--panel);border-radius:12px;border:1px solid var(--border)}
.course-logo{width:180px;height:auto;margin-bottom:8px}
.course-title{font-size:1.2em;color:var(--blue);margin:0;font-weight:600}
h1{font-size:32px;margin:0 0 12px; text-align: center;}
p.lead{color:var(--muted);margin:0 0 24px; font-size: 18px; text-align: center;}
.intro{
background:var(--panel); border:1px solid var(--border); border-radius:16px; padding:24px;
margin-bottom: 24px; box-shadow:0 2px 8px rgba(0,0,0,.1);
}
.intro h2{margin:0 0 12px; color:var(--blue);}
.intro p{margin:12px 0; font-size: 16px;}
.stage{
border:1px solid var(--border); border-radius:16px; padding:24px;
background:var(--bg); margin-bottom:24px;
box-shadow:0 2px 8px rgba(0,0,0,.1);
}
.stage h2{margin:0 0 12px; font-size:20px; color:var(--blue);}
.stage .explanation{
font-size:15px; color:var(--text); margin-bottom:16px;
background:rgba(13,110,253,.08); padding:12px 16px; border-radius:12px;
border-left: 4px solid var(--blue);
}
.documents-section{margin-bottom: 40px;}
.document{
background: var(--panel); padding: 15px; margin-bottom: 15px;
border-radius: 8px; border: 2px solid var(--border); transition: all 0.3s;
}
.document.highlight{border-color: var(--blue); background: rgba(13,110,253,.1);}
.doc-title{font-weight: bold; color: var(--text); margin-bottom: 5px;}
.doc-content{color: var(--text);}
.word-highlight{background: var(--yellow); padding: 2px 4px; border-radius: 3px; font-weight: bold;}
.controls-section{
background: var(--panel); padding: 25px; border-radius: 16px;
margin-bottom: 30px; border: 1px solid var(--border);
}
.control-group{margin-bottom: 25px;}
.control-label{font-weight: bold; display: block; margin-bottom: 8px; color: var(--text);}
select, input[type=range], input[type=number]{
width:100%; background:var(--bg); border:1px solid var(--border); color:var(--text);
border-radius:10px; padding:10px; outline:none; font-size: 14px; cursor: pointer;
}
select:focus{outline: none; border-color: var(--blue);}
.calculations-section{display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; margin-bottom: 30px;}
.calc-card{
background: var(--panel); border: 1px solid var(--border); border-radius: 16px; padding: 20px;
transition: transform 0.3s, box-shadow 0.3s; box-shadow:0 2px 8px rgba(0,0,0,.1);
}
.calc-card:hover{transform: translateY(-2px); box-shadow: 0 4px 16px rgba(0,0,0,0.15);}
.calc-title{font-size: 1.1em; font-weight: 600; color: var(--blue); margin-bottom: 10px;}
.calc-formula{
background: var(--panel); padding: 10px; border-radius: 8px; border: 1px solid var(--border);
font-family: ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas; margin-bottom: 10px; font-size: 0.9em;
}
.calc-value{font-size: 2em; font-weight: bold; color: var(--text); text-align: center; margin-top: 15px;}
.calc-explanation{color: var(--muted); font-size: 0.9em; margin-top: 10px;}
.results-table{width: 100%; border-collapse: collapse; margin-top: 20px;}
.results-table th, .results-table td{padding: 12px; text-align: left; border-bottom: 1px solid var(--border);}
.results-table th{background: var(--blue); color: white; font-weight: bold;}
.results-table tr:hover{background: var(--panel);}
.explanation-box{
background: rgba(255,193,7,.1); border: 1px solid var(--yellow);
border-radius: 12px; padding: 20px; margin-top: 30px;
}
.explanation-box h3{color: var(--orange); margin-bottom: 10px;}
.formula-breakdown{background: var(--bg); padding: 15px; border-radius: 8px; margin-top: 10px; border: 1px solid var(--border);}
.btn{
display: inline-flex; align-items: center; gap: 8px; padding: 10px 14px;
border-radius: 12px; border: 1px solid var(--border); background: var(--bg);
color: var(--text); cursor: pointer; font-size: 14px; margin: 4px;
transition: all 0.3s ease; text-decoration: none;
}
.btn:hover{background: var(--panel); transform: translateY(-1px);}
.btn-primary{background: var(--blue); color: white; border-color: var(--blue);}
.btn-primary:hover{background: #0b5ed7; border-color: #0b5ed7;}
.interactive-word {
cursor: pointer;
transition: all 0.2s ease;
border-radius: 3px;
padding: 1px 2px;
}
.interactive-word:hover {
background: var(--yellow);
transform: scale(1.05);
}
.interactive-word.selected {
background: var(--emerald) !important;
color: white !important;
font-weight: bold;
}
.game-doc {
margin-bottom: 15px;
}
.game-doc:hover {
border-color: var(--blue);
}
.level-indicator {
display: inline-block;
font-size: 14px;
}
@media (max-width: 768px) {
.container{padding: 20px;}
h1{font-size: 1.8em;}
.calculations-section{grid-template-columns: 1fr;}
.game-area{grid-template-columns: 1fr !important;}
}
</style>
</head>
<body>
<div class="wrap">
<div class="container">
<div class="course-header">
<img src="img/dlab-bubble-logo-2025.png" alt="D-Lab Logo" class="course-logo">
</div>
<h1>Understanding TF-IDF</h1>
<p class="lead">Term Frequency - Inverse Document Frequency Explained</p>
<div class="intro">
<p><strong>TF-IDF</strong> is a numerical statistic that reflects how important a word is to a document within a collection of documents. But how do computers actually process text to calculate these statistics?</p>
<p>In this interactive tutorial, we'll build TF-IDF from the ground up, learning:</p>
<ul style="margin: 12px 0; padding-left: 20px;">
<li>How computers represent text as <strong>Document-Term Matrices</strong></li>
<li>Why most text data is <strong>sparse</strong> and how to address this</li>
<li>Essential <strong>preprocessing steps</strong> that clean and normalize text</li>
<li>How <strong>Term Frequency</strong> measures word importance within documents</li>
<li>How <strong>TF-IDF weighting</strong> balances frequency with rarity across documents</li>
</ul>
<p>By the end, you'll understand the complete pipeline from raw text to meaningful document representations!</p>
<button class="btn btn-primary" onclick="showDocuments()" style="margin-top: 15px;">Next: See Document Collection →</button>
</div>
<div class="documents-section" id="documents-stage" style="display: none;">
<h2>Sample Document Collection</h2>
<p style="margin-bottom: 15px; color: #666;">Here's our small corpus of three documents covering different topics. We'll use these to explore how computers process and understand text data.</p>
<div class="document" id="doc1">
<div class="doc-title">Document 1: Technology</div>
<div class="doc-content">The computer processes data efficiently. Advanced algorithms process information rapidly.</div>
</div>
<div class="document" id="doc2">
<div class="doc-title">Document 2: Nature</div>
<div class="doc-content">The forest contains many trees. Wildlife inhabits diverse ecosystems with wildlife everywhere.</div>
</div>
<div class="document" id="doc3">
<div class="doc-title">Document 3: Mixed</div>
<div class="doc-content">Scientists research forest data analysis. Scientists study ecosystem patterns and trends.</div>
</div>
<button class="btn btn-primary" onclick="showDTM()" style="margin-top: 20px;">Next: Document-Term Matrix →</button>
</div>
<!-- Document-Term Matrix Section -->
<div class="stage" id="dtm-stage" style="display: none;">
<h2>Step 1: Representing Text as Data</h2>
<div class="explanation">
How do computers work with text? First, they split text into discrete units called <strong>tokens</strong> (usually individual words). Then they create a <strong>Document-Term Matrix (DTM)</strong>: each row represents a document, each column represents a unique token from the <strong>vocabulary</strong> (all unique tokens across documents), and each cell contains the count of that token in that document. This creates a high-dimensional, sparse data structure.
</div>
<div class="controls-section">
<div class="control-group">
<button class="btn btn-primary" onclick="generateDTM()" id="generateDTMBtn">Generate Document-Term Matrix</button>
<button class="btn" onclick="toggleDTMView()" id="toggleDTMBtn" style="display: none;">Toggle Full Vocabulary</button>
</div>
</div>
<div id="dtm-visualization" style="display: none;">
<h3 style="color: var(--blue); margin-bottom: 15px;">Raw Document-Term Matrix</h3>
<div id="dtm-container" style="overflow-x: auto; border: 1px solid var(--border); border-radius: 8px;">
<table id="dtm-table" style="border-collapse: collapse; width: 100%; min-width: 800px;">
<!-- Will be populated by JavaScript -->
</table>
</div>
<div class="explanation-box" id="dtm-stats" style="margin-top: 20px;">
<h3>Matrix Statistics</h3>
<div id="dtm-stats-content">
<!-- Will be populated by JavaScript -->
</div>
<div class="formula-breakdown">
<strong>Step 2: The Sparsity Problem</strong>
<p style="margin: 10px 0;">This matrix structure creates challenges:</p>
<ul style="margin-top: 10px; margin-left: 20px;">
<li>Most cells are zeros — tokens don't appear in most documents.</li>
<li>As vocabulary size grows, the matrix becomes increasingly sparse.</li>
<li>With 200,000+ possible English words, most documents only use a tiny fraction.</li>
<li>Result: storage inefficiency and noisy, high-dimensional data.</li>
</ul>
<p style="margin: 15px 0; padding: 10px; background: rgba(13,110,253,.08); border-radius: 8px;"><strong>Two main approaches:</strong><br>
1. <strong>Reduce the feature space</strong> — shrink the number of columns (e.g. stopword removal, vocabulary pruning, lemmatization, n-gram limits).<br>
2. <strong>Reweight the values</strong> — keep the same dimensions but make the numbers more informative (e.g. TF-IDF).</p>
</div>
</div>
</div>
<div id="dtm-next-step" style="display: none; text-align: center; margin-top: 20px;">
<button class="btn btn-primary" onclick="showPreprocessing()">Next: Dimensionality Reduction →</button>
</div>
</div>
<!-- Text Preprocessing Section -->
<div class="stage" id="preprocessing-stage" style="display: none;">
<h2>Step 3: Tokenization & Preprocessing as Dimensionality Reduction</h2>
<div class="explanation">
One solution to the sparsity problem is <strong>reducing the feature space</strong> before we calculate any weights. <strong>Tokenization</strong> is how we split text into discrete units, and preprocessing affects both how we tokenize and which tokens we keep. Better tokenization (removing punctuation) and token filtering (removing stopwords, normalizing case) actually <em>shrink the number of columns</em> in our matrix. Let's see the impact on our vocabulary size.
</div>
<div class="controls-section">
<div class="control-group">
<label class="control-label">Preprocessing Steps:</label>
<div style="display: flex; flex-wrap: wrap; gap: 10px; margin-top: 10px;">
<label style="display: flex; align-items: center; gap: 5px;">
<input type="checkbox" id="removeStopwords" onchange="updatePreprocessing()">
Remove stopwords ("the", "are", "is", etc.)
</label>
<label style="display: flex; align-items: center; gap: 5px;">
<input type="checkbox" id="lowercase" onchange="updatePreprocessing()" checked>
Convert to lowercase
</label>
<label style="display: flex; align-items: center; gap: 5px;">
<input type="checkbox" id="removePunctuation" onchange="updatePreprocessing()">
Remove punctuation
</label>
</div>
</div>
</div>
<div id="preprocessing-comparison" style="display: none;">
<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 20px; margin-bottom: 20px;">
<div>
<h4 style="color: var(--blue);">Before Preprocessing</h4>
<div id="before-preprocessing" class="document" style="font-family: monospace; font-size: 14px;">
<!-- Will be populated by JavaScript -->
</div>
</div>
<div>
<h4 style="color: var(--emerald);">After Preprocessing</h4>
<div id="after-preprocessing" class="document" style="font-family: monospace; font-size: 14px;">
<!-- Will be populated by JavaScript -->
</div>
</div>
</div>
<div class="explanation-box">
<h3>Dimensionality Reduction Impact</h3>
<div id="preprocessing-stats">
<!-- Will be populated by JavaScript -->
</div>
<div class="formula-breakdown">
<strong>Key Point:</strong> Preprocessing <em>reduces the number of columns</em> in our matrix by shrinking the vocabulary size.
<ul style="margin-top: 10px; margin-left: 20px;">
<li><strong>Better tokenization:</strong> Removing punctuation creates cleaner tokens ("rapidly." → "rapidly")</li>
<li><strong>Case normalization:</strong> "Computer" and "computer" become the same token</li>
<li><strong>Token filtering:</strong> Common tokens like "the" that appear everywhere are removed</li>
<li><strong>Smaller vocabulary:</strong> Matrix goes from 3×30 to 3×18 terms (fewer columns)</li>
</ul>
<p style="margin-top: 15px; padding: 10px; background: rgba(25,135,84,.08); border-radius: 8px;"><strong>Next up:</strong> We'll explore the <em>reweighting approach</em> using TF-IDF. Note that TF-IDF works on any matrix - preprocessed or raw. In fact, TF-IDF is designed to handle issues like common words (giving them low scores) without requiring preprocessing first!</p>
</div>
</div>
</div>
<div id="preprocessing-next-step" style="display: none; text-align: center; margin-top: 20px;">
<button class="btn btn-primary" onclick="showTermFrequencies()">Next: Term Frequencies (Clean Data) →</button>
</div>
</div>
<!-- Term Frequencies Section -->
<div class="stage" id="term-freq-stage" style="display: none;">
<h2>Step 4: Term Frequency (TF) - Local Importance</h2>
<div class="explanation">
Now we start <strong>reweighting our matrix values</strong>. The first component is <strong>Term Frequency (TF)</strong>, which measures how important a token is <em>within a single document</em>. Instead of raw counts, TF gives us proportions: how often does this token appear relative to the document's total token count?
</div>
<div class="controls-section">
<div class="control-group">
<label class="control-label" for="tfWordSelect">Select a token to analyze:</label>
<select id="tfWordSelect">
<option value="">Choose a token...</option>
<option value="computer">computer</option>
<option value="the">the</option>
<option value="forest">forest</option>
<option value="data">data</option>
<option value="trees">trees</option>
<option value="systems">systems</option>
</select>
</div>
<div class="control-group">
<label class="control-label" for="tfDocSelect">Select a document:</label>
<select id="tfDocSelect">
<option value="1">Document 1: Technology</option>
<option value="2">Document 2: Nature</option>
<option value="3">Document 3: Mixed</option>
</select>
</div>
</div>
<div class="calculations-section" id="tf-calculations" style="display: none;">
<div class="calc-card">
<div class="calc-title">Token Count</div>
<div class="calc-formula">Number of times the token appears in the document</div>
<div class="calc-value" id="wordCountValue">0</div>
<div class="calc-explanation" id="wordCountExplanation"></div>
</div>
<div class="calc-card">
<div class="calc-title">Total Tokens</div>
<div class="calc-formula">Total number of tokens in the document</div>
<div class="calc-value" id="totalWordsValue">0</div>
<div class="calc-explanation" id="totalWordsExplanation"></div>
</div>
<div class="calc-card">
<div class="calc-title">Term Frequency</div>
<div class="calc-formula">TF = Token Count / Total Tokens</div>
<div class="calc-value" id="simpleTefrValue" style="color: var(--blue);">0</div>
<div class="calc-explanation">Frequency of this token in this document</div>
</div>
</div>
<div id="tf-comparison" style="display: none;">
<h3 style="margin-bottom: 15px; color: var(--blue);">Term Frequencies Across All Documents</h3>
<table class="results-table">
<thead>
<tr>
<th>Document</th>
<th>Token Count</th>
<th>Total Tokens</th>
<th>Term Frequency</th>
</tr>
</thead>
<tbody id="tfTableBody">
</tbody>
</table>
</div>
<div class="explanation-box" id="tf-explanation" style="display: none;">
<h3>Understanding Term Frequency</h3>
<div id="tfDynamicExplanation"></div>
<div class="formula-breakdown">
<strong>Key Insights:</strong>
<ul style="margin-top: 10px; margin-left: 20px;">
<li>Term Frequency shows how important a token is within a single document</li>
<li>Higher frequencies suggest the token is more central to that document's topic</li>
<li>TF values range from 0 (token doesn't appear) to 1 (token appears in every position)</li>
</ul>
<p style="margin-top: 15px;"><strong>Limitation:</strong> Term Frequency alone has a problem - common tokens like "the" appear frequently but may not be very meaningful for understanding document content. This is why TF-IDF is useful: it balances term frequency with how rare or common a token is across all documents.</p>
</div>
<button class="btn btn-primary" onclick="showCalculator()" style="margin-top: 20px;">Next: Full TF-IDF Calculator →</button>
</div>
</div>
<!-- TF-IDF Calculator Section -->
<div class="stage" id="calculator-stage" style="display: none;">
<h2>Steps 5-6: TF-IDF = Putting It All Together</h2>
<div class="explanation">
Now for the final reweighting step. <strong>TF-IDF doesn't reduce dimensions</strong> - it keeps the same matrix size but changes the values to better reflect token importance. We combine <strong>Term Frequency (local importance)</strong> with <strong>Inverse Document Frequency (global rarity)</strong>.
</div>
<div class="controls-section">
<div class="control-group">
<label class="control-label" for="wordSelect">Select a token to analyze:</label>
<select id="wordSelect">
<option value="">Choose a token...</option>
<option value="computer">computer</option>
<option value="the">the</option>
<option value="forest">forest</option>
<option value="data">data</option>
<option value="trees">trees</option>
<option value="systems">systems</option>
</select>
</div>
<div class="control-group">
<label class="control-label" for="docSelect">Select a document:</label>
<select id="docSelect">
<option value="1">Document 1: Technology</option>
<option value="2">Document 2: Nature</option>
<option value="3">Document 3: Mixed</option>
</select>
</div>
</div>
<div class="calculations-section" id="calculations" style="display: none;">
<div class="calc-card">
<div class="calc-title">Step 4 Review: Term Frequency (TF)</div>
<div class="calc-formula">TF = (Count of term in doc) / (Total terms in doc)</div>
<div class="calc-value" id="tfValue">0</div>
<div class="calc-explanation" id="tfExplanation"></div>
</div>
<div class="calc-card">
<div class="calc-title">Step 5: Inverse Document Frequency (IDF)</div>
<div class="calc-formula">IDF = log<sub>10</sub>(Total docs / Docs with term)</div>
<div class="calc-value" id="idfValue">0</div>
<div class="calc-explanation" id="idfExplanation"></div>
</div>
<div class="calc-card">
<div class="calc-title">Step 6: TF-IDF Score</div>
<div class="calc-formula">TF-IDF = TF × IDF</div>
<div class="calc-value" id="tfidfValue" style="color: #764ba2;">0</div>
<div class="calc-explanation">Final reweighted importance score for this term in this document</div>
</div>
</div>
<div id="comparisonTable" style="display: none;">
<h3 style="margin-bottom: 15px; color: var(--blue);">TF-IDF Scores Across All Documents</h3>
<table class="results-table">
<thead>
<tr>
<th>Document</th>
<th>Term Count</th>
<th>Term Frequency</th>
<th>TF-IDF Score</th>
</tr>
</thead>
<tbody id="tableBody">
</tbody>
</table>
</div>
<div class="explanation-box" id="explanation" style="display: none;">
<h3>What's Happening?</h3>
<div id="dynamicExplanation"></div>
<div class="formula-breakdown">
<strong>Key Insights:</strong>
<ul style="margin-top: 10px; margin-left: 20px;">
<li>Words that appear frequently in a specific document get higher TF scores</li>
<li>Words that appear in many documents get lower IDF scores</li>
<li>Common words like "the" have low TF-IDF scores because they appear everywhere</li>
<li>Unique, document-specific words have high TF-IDF scores</li>
</ul>
</div>
<button class="btn btn-primary" onclick="showPracticeGame()" style="margin-top: 20px;">Next: Practice Game →</button>
</div>
<!-- Interactive Practice Game Section -->
<div class="stage" id="practice-game-stage" style="display: none;">
<h2>Interactive Practice: Word Detective</h2>
<div class="explanation">
Now let's put your TF-IDF knowledge to the test! <strong>Click on words in the documents</strong> to see their TF-IDF scores. Can you find the most important (highest TF-IDF) word in each document?
</div>
<div class="controls-section">
<div class="control-group">
<button class="btn btn-primary" onclick="startNewChallenge()">🎯 New Challenge</button>
<button onclick="showGameHint()">💡 Hint</button>
<span class="level-indicator" style="background: var(--emerald); color: white; padding: 8px 16px; border-radius: 20px; font-weight: bold; margin-left: auto;">
Score: <span id="game-score">0</span>
</span>
</div>
</div>
<div class="game-area" style="display: grid; grid-template-columns: 1fr 1fr; gap: 30px; margin: 24px 0;">
<div class="game-documents">
<h3 style="color: var(--blue); margin-bottom: 15px;">📄 Interactive Documents</h3>
<p style="color: var(--muted); margin-bottom: 15px; font-size: 14px;">Click on any word to see its TF-IDF score!</p>
<div class="document game-doc" id="game-doc1">
<div class="doc-title">Document 1: Technology</div>
<div class="doc-content interactive-content" id="game-content1">
<!-- Will be populated by JavaScript -->
</div>
</div>
<div class="document game-doc" id="game-doc2">
<div class="doc-title">Document 2: Nature</div>
<div class="doc-content interactive-content" id="game-content2">
<!-- Will be populated by JavaScript -->
</div>
</div>
<div class="document game-doc" id="game-doc3">
<div class="doc-title">Document 3: Mixed</div>
<div class="doc-content interactive-content" id="game-content3">
<!-- Will be populated by JavaScript -->
</div>
</div>
</div>
<div class="game-analysis">
<h3 style="color: var(--blue); margin-bottom: 15px;">🔬 Analysis Lab</h3>
<div id="word-analysis" style="display: none;">
<div class="calc-card" style="margin-bottom: 15px;">
<div class="calc-title">Selected Token</div>
<div class="calc-value" id="selected-token" style="color: var(--blue); font-size: 1.5em;">-</div>
</div>
<div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 15px; margin-bottom: 20px;">
<div class="calc-card">
<div class="calc-title">TF</div>
<div class="calc-value" id="game-tf-value" style="color: var(--orange);">0</div>
</div>
<div class="calc-card">
<div class="calc-title">IDF</div>
<div class="calc-value" id="game-idf-value" style="color: var(--blue);">0</div>
</div>
<div class="calc-card">
<div class="calc-title">TF-IDF</div>
<div class="calc-value" id="game-tfidf-value" style="color: var(--purple);">0</div>
</div>
</div>
<div class="explanation-box">
<div id="word-explanation">
<!-- Will be populated by JavaScript -->
</div>
</div>
</div>
<div id="challenge-status" style="display: none;">
<div class="explanation-box">
<h3>🎯 Challenge Progress</h3>
<div id="challenge-progress">
<!-- Will be populated by JavaScript -->
</div>
<div id="challenge-completion" style="display: none;">
<h4 style="color: var(--emerald);">🎉 Challenge Complete!</h4>
<p>Great job finding the most important words! You've mastered TF-IDF analysis.</p>
<button class="btn btn-primary" onclick="showSummarySection()" style="margin-top: 15px;">Continue to Summary →</button>
</div>
</div>
</div>
</div>
</div>
</div>
<div class="intro" id="summary-stage" style="display: none;">
<h2>Summary</h2>
<p>You've now experienced the complete TF-IDF calculation process through interactive exploration:</p>
<ul style="margin: 12px 0; padding-left: 20px;">
<li><strong>Term Frequency (TF)</strong>: Measures how often a word appears in a specific document</li>
<li><strong>Inverse Document Frequency (IDF)</strong>: Measures how unique a word is across all documents</li>
<li><strong>TF-IDF Score</strong>: Combines both metrics to identify the most important words for each document</li>
<li><strong>Practical applications</strong>: Used in search engines, document classification, and text analysis</li>
</ul>
<div class="goal">
<h3>Limitations of Bag-of-Words Approaches</h3>
<p>While TF-IDF is powerful, bag-of-words approaches have several important limitations:</p>
<ul style="margin: 12px 0; padding-left: 20px;">
<li><strong>Words are treated as completely distinct:</strong> The model doesn't learn that "happy" and "pleased" are similar, or that "he" and "she" are both pronouns, or that "Obama" and "Biden" are both presidents</li>
<li><strong>Vocabulary explosion:</strong> As document sizes grow, the number of features increases dramatically (there are ~200,000 dictionary words in English)</li>
<li><strong>Word order is ignored:</strong> "The acting is good but the script is bad" has the same features as "The acting is bad but the script is good"</li>
<li><strong>No semantic understanding:</strong> The approach can't capture meaning, context, or relationships between concepts</li>
</ul>
<p style="margin-top: 15px;"><strong>The big question:</strong> How do we represent words as features so that similar words are similar? This challenge leads to more advanced approaches like word embeddings and neural language models.</p>
</div>
</div>
</div>
</div>
<script>
// Sample corpus, keyed by 1-based document id. Each entry carries the display
// title, the raw sentence text, and the lowercase word list derived from it.
const documents = (() => {
  const corpus = [
    {
      title: "Technology",
      content: "The computer processes data efficiently. Advanced algorithms process information rapidly."
    },
    {
      title: "Nature",
      content: "The forest contains many trees. Wildlife inhabits diverse ecosystems with wildlife everywhere."
    },
    {
      title: "Mixed",
      content: "Scientists research forest data analysis. Scientists study ecosystem patterns and trends."
    }
  ];

  const byId = {};
  corpus.forEach(({ title, content }, position) => {
    // Tokens are the runs of letters in the lowercased text, so punctuation
    // is dropped and repeated words are kept in reading order.
    const words = content.toLowerCase().match(/[a-z]+/g);
    byId[position + 1] = { title, content, words };
  });
  return byId;
})();
/**
 * Count the entries in a document's word list that match a term.
 *
 * Matching ignores case and treats one trailing "s" as optional in either
 * direction: a word matches if it equals the term, the term plus "s", or the
 * term with its final "s" removed.
 *
 * @param {string} term - Term to look for.
 * @param {number|string} docId - Key of the document in `documents`.
 * @returns {number} Number of matching entries in that document's `words`.
 */
function countTermInDoc(term, docId) {
  const needle = term.toLowerCase();

  // Every spelling that counts as a hit, computed once instead of per word.
  const accepted = new Set([needle, `${needle}s`]);
  if (needle.endsWith('s')) {
    accepted.add(needle.slice(0, -1));
  }

  let matches = 0;
  for (const token of documents[docId].words) {
    if (accepted.has(token.toLowerCase())) {
      matches += 1;
    }
  }
  return matches;
}
/**
 * Term frequency: the share of a document's word list that matches a term.
 *
 * @param {string} term - Term to measure.
 * @param {number|string} docId - Key of the document in `documents`.
 * @returns {number} Matching word count divided by the document's word count.
 */
function calculateTF(term, docId) {
  const occurrences = countTermInDoc(term, docId);
  const { length: tokenTotal } = documents[docId].words;
  return occurrences / tokenTotal;
}
/**
 * Inverse document frequency: log10(total documents / documents containing
 * the term), using `countTermInDoc` to decide whether a document contains it.
 *
 * A term found in no document yields 0 instead of dividing by zero. Without
 * this guard the result would be Infinity, and multiplying it by a term
 * frequency of 0 would produce NaN for the TF-IDF score.
 *
 * @param {string} term - Term to measure across the corpus.
 * @returns {number} Finite IDF value; 0 when the term appears in every
 *   document or in none of them.
 */
function calculateIDF(term) {
  const docIds = Object.keys(documents);
  const docsWithTerm = docIds.filter((docId) => countTermInDoc(term, docId) > 0).length;

  // A term absent from the whole corpus carries no weight.
  if (docsWithTerm === 0) {
    return 0;
  }
  return Math.log10(docIds.length / docsWithTerm);
}
/**
 * Re-render every sample document so the token chosen in `#wordSelect` is
 * wrapped in a `.word-highlight` span, and mark the document chosen in
 * `#docSelect` with the `highlight` class.
 *
 * With no token selected, every document is restored to its plain text and no
 * document is marked.
 *
 * Highlighting accepts the same spellings as `countTermInDoc`: the token, the
 * token plus "s", and the token without a trailing "s". The token is escaped
 * before being placed in the regular expression so that characters such as
 * "." or "+" are matched literally.
 */
function highlightWords() {
  const selectedWord = document.getElementById('wordSelect').value;
  const selectedDoc = document.getElementById('docSelect').value;

  let matcher = null;
  if (selectedWord) {
    const variants = [selectedWord, `${selectedWord}s`];
    // Only drop the final character when it really is a plural "s";
    // otherwise "data" would also highlight an unrelated word like "dat".
    if (/s$/i.test(selectedWord)) {
      variants.push(selectedWord.slice(0, -1));
    }
    const alternatives = variants
      .filter((variant) => variant.length > 0)
      .map((variant) => variant.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('|');
    matcher = new RegExp(`\\b(${alternatives})\\b`, 'gi');
  }

  for (const docId of Object.keys(documents)) {
    const docElement = document.getElementById(`doc${docId}`);
    const { content } = documents[docId];

    // The corpus text is static and authored in this file, so writing it via
    // innerHTML is safe here; do not reuse this pattern for user-supplied text.
    docElement.querySelector('.doc-content').innerHTML = matcher
      ? content.replace(matcher, '<span class="word-highlight">$1</span>')
      : content;

    docElement.classList.remove('highlight');
    if (matcher && docId === String(selectedDoc)) {
      docElement.classList.add('highlight');
    }
  }
}
/**
 * Refresh the TF-IDF calculator panel for the word and document currently
 * chosen in the `wordSelect` / `docSelect` controls.
 *
 * With no word selected the calculation, comparison and explanation panels
 * are hidden. Otherwise the function fills in the TF, IDF and TF-IDF values,
 * rebuilds the per-document comparison table (the selected document's row is
 * emphasised) and writes an explanation that depends on how many documents
 * contain the word.
 */
function updateCalculations() {
  const byId = (id) => document.getElementById(id);
  const selectedWord = byId('wordSelect').value;
  const selectedDoc = byId('docSelect').value;

  if (!selectedWord) {
    byId('calculations').style.display = 'none';
    byId('comparisonTable').style.display = 'none';
    byId('explanation').style.display = 'none';
    return;
  }

  const docIds = Object.keys(documents);
  const totalDocs = docIds.length;
  // A TF of 0 always gives a TF-IDF of 0; short-circuiting avoids
  // 0 * Infinity = NaN when the IDF is infinite (word found in no document).
  const scoreOf = (tfValue, idfValue) => (tfValue === 0 ? 0 : tfValue * idfValue);

  // Calculate values
  const tf = calculateTF(selectedWord, selectedDoc);
  const idf = calculateIDF(selectedWord);
  const tfidf = scoreOf(tf, idf);
  const termCount = countTermInDoc(selectedWord, selectedDoc);
  const totalTerms = documents[selectedDoc].words.length;
  const docsWithTerm = docIds.filter(
    (docId) => countTermInDoc(selectedWord, docId) > 0
  ).length;

  // Update TF display
  byId('tfValue').textContent = tf.toFixed(4);
  byId('tfExplanation').textContent =
    `"${selectedWord}" appears ${termCount} time(s) out of ${totalTerms} total words`;

  // Update IDF display
  byId('idfValue').textContent = idf.toFixed(4);
  byId('idfExplanation').textContent =
    `Appears in ${docsWithTerm} out of ${totalDocs} documents`;

  // Update TF-IDF display
  byId('tfidfValue').textContent = tfidf.toFixed(4);

  // Show calculations
  byId('calculations').style.display = 'grid';

  // Update comparison table
  const tableBody = byId('tableBody');
  tableBody.innerHTML = '';
  for (const docId of docIds) {
    const row = tableBody.insertRow();
    const docTF = calculateTF(selectedWord, docId);
    const docTFIDF = scoreOf(docTF, idf);
    const docTermCount = countTermInDoc(selectedWord, docId);

    row.insertCell(0).textContent = `Document ${docId}: ${documents[docId].title}`;
    row.insertCell(1).textContent = docTermCount;
    row.insertCell(2).textContent = docTF.toFixed(4);
    row.insertCell(3).textContent = docTFIDF.toFixed(4);

    if (String(docId) === String(selectedDoc)) {
      row.style.background = '#f0f0ff';
      row.style.fontWeight = 'bold';
    }
  }
  byId('comparisonTable').style.display = 'block';

  // Update explanation. The "appears in all documents" text is chosen from
  // the actual document count rather than from the literal word "the", which
  // is present in only two of the three demo documents.
  let explanationText = '';
  if (docsWithTerm === totalDocs) {
    explanationText = `The word "${selectedWord}" is a common word that appears in all documents. Notice how it has a low IDF score (${idf.toFixed(3)}) because it appears everywhere. This makes its TF-IDF score low, correctly identifying it as not very informative for distinguishing between documents.`;
  } else if (docsWithTerm === 1) {
    explanationText = `The word "${selectedWord}" appears in only one document, giving it a high IDF score (${idf.toFixed(3)}). This makes it highly distinctive and valuable for characterizing that specific document.`;
  } else {
    explanationText = `The word "${selectedWord}" appears in ${docsWithTerm} out of ${totalDocs} documents. Its IDF score (${idf.toFixed(3)}) reflects this distribution. The TF-IDF score balances how often it appears in the selected document against how common it is across all documents.`;
  }
  byId('dynamicExplanation').textContent = explanationText;
  byId('explanation').style.display = 'block';
}
// Navigation functions for progressive disclosure

/**
 * Make a stage container visible and smoothly scroll it into view.
 * Shared by all the show*() entry points below, which previously each
 * repeated the same two element lookups.
 *
 * @param {string} stageId - Element id of the stage container.
 */
function revealStage(stageId) {
  const stage = document.getElementById(stageId);
  stage.style.display = 'block';
  stage.scrollIntoView({ behavior: 'smooth' });
}

/** Reveal the documents stage. */
function showDocuments() {
  revealStage('documents-stage');
}

/** Reveal the document-term-matrix stage. */
function showDTM() {
  revealStage('dtm-stage');
}

/** Reveal the preprocessing stage, then refresh it via updatePreprocessing(). */
function showPreprocessing() {
  revealStage('preprocessing-stage');
  updatePreprocessing();
}

/** Reveal the term-frequency stage. */
function showTermFrequencies() {
  revealStage('term-freq-stage');
}

/** Reveal the TF-IDF calculator stage. */
function showCalculator() {
  revealStage('calculator-stage');
}

/** Reveal the summary stage. */
function showSummarySection() {
  revealStage('summary-stage');
}
/**
 * Refresh the term-frequency stage for the word and document chosen in the
 * `tfWordSelect` / `tfDocSelect` controls.
 *
 * With no word selected, the three term-frequency panels are hidden.
 * Otherwise the count, token total and TF value are displayed, and the
 * comparison table, explanation and document highlighting are refreshed
 * through their dedicated helpers.
 */
function updateTermFrequencies() {
  const byId = (id) => document.getElementById(id);
  const word = byId('tfWordSelect').value;
  const docKey = byId('tfDocSelect').value;

  if (!word) {
    for (const panelId of ['tf-calculations', 'tf-comparison', 'tf-explanation']) {
      byId(panelId).style.display = 'none';
    }
    return;
  }

  // Reveal the calculation cards before filling them in.
  byId('tf-calculations').style.display = 'grid';

  // Figures for the selected document.
  const occurrences = countTermInDoc(word, docKey);
  const tokenTotal = documents[docKey].words.length;
  const frequency = calculateTF(word, docKey);

  // Values.
  byId('wordCountValue').textContent = occurrences;
  byId('totalWordsValue').textContent = tokenTotal;
  byId('simpleTefrValue').textContent = frequency.toFixed(3);

  // Captions under the values.
  byId('wordCountExplanation').textContent =
    `"${word}" appears ${occurrences} time(s) in Document ${docKey}`;
  byId('totalWordsExplanation').textContent =
    `Document ${docKey} contains ${tokenTotal} tokens total`;

  // Cross-document comparison table.
  byId('tf-comparison').style.display = 'block';
  updateTermFrequencyTable(word);

  // Narrative explanation.
  byId('tf-explanation').style.display = 'block';
  updateTermFrequencyExplanation(word, docKey);

  // Mark the word inside the document cards.
  highlightTermFrequencyWords(word);
}
/**
 * Rebuild the term-frequency comparison table (`tfTableBody`) with one row
 * per document: label, matching-token count, total tokens and TF.
 *
 * Rows are generated from the keys of `documents` rather than a fixed 1..3
 * range, so the table always reflects the whole corpus.
 *
 * @param {string} word - Term whose frequency is tabulated.
 */
function updateTermFrequencyTable(word) {
  const tableBody = document.getElementById('tfTableBody');
  tableBody.innerHTML = '';

  for (const docId of Object.keys(documents)) {
    const wordCount = countTermInDoc(word, docId);
    const totalWords = documents[docId].words.length;
    const tf = calculateTF(word, docId);

    const row = tableBody.insertRow();
    row.insertCell(0).textContent = `Document ${docId}: ${documents[docId].title}`;
    row.insertCell(1).textContent = wordCount;
    row.insertCell(2).textContent = totalWords;
    row.insertCell(3).textContent = tf.toFixed(3);
  }
}
/**
 * Write the narrative term-frequency explanation into `tfDynamicExplanation`.
 *
 * The first paragraph describes the token's frequency in the given document;
 * when the token occurs anywhere in the corpus, a second paragraph names the
 * document with the highest frequency (the first one on ties).
 *
 * The cross-document comparison walks the keys of `documents` and reports the
 * winning key itself, instead of assuming the ids are exactly 1, 2 and 3.
 *
 * @param {string} word - Token being explained.
 * @param {string|number} docId - Key of the document in `documents`.
 */
function updateTermFrequencyExplanation(word, docId) {
  const wordCount = countTermInDoc(word, docId);
  const tf = calculateTF(word, docId);

  let explanation = `<p><strong>Analysis for token "${word}" in Document ${docId}:</strong></p>`;
  if (wordCount === 0) {
    explanation += `<p>The token "${word}" does not appear in Document ${docId}, so its term frequency is 0.</p>`;
  } else if (wordCount === 1) {
    explanation += `<p>The token "${word}" appears once in Document ${docId}. With ${documents[docId].words.length} total tokens, its frequency is ${tf.toFixed(3)} (${(tf * 100).toFixed(1)}%).</p>`;
  } else {
    explanation += `<p>The token "${word}" appears ${wordCount} times in Document ${docId}. With ${documents[docId].words.length} total tokens, its frequency is ${tf.toFixed(3)} (${(tf * 100).toFixed(1)}%).</p>`;
  }

  // Compare across documents. Strict ">" keeps the first document on ties.
  let maxTF = 0;
  let maxDoc = null;
  for (const candidateId of Object.keys(documents)) {
    const candidateTF = calculateTF(word, candidateId);
    if (candidateTF > maxTF) {
      maxTF = candidateTF;
      maxDoc = candidateId;
    }
  }
  if (maxDoc !== null) {
    explanation += `<p><strong>Cross-document comparison:</strong> Token "${word}" has the highest frequency in Document ${maxDoc} (${maxTF.toFixed(3)}), making it most relevant to that document's topic.</p>`;
  }

  document.getElementById('tfDynamicExplanation').innerHTML = explanation;
}
/**
 * Re-render every document card for the term-frequency stage: restore the
 * plain text, then (when a word is given) wrap each occurrence in a
 * `word-highlight` span and add the `highlight` class to every card whose
 * document contains the word according to `countTermInDoc`.
 *
 * @param {string} selectedWord - Word to highlight; a falsy value only resets.
 */
function highlightTermFrequencyWords(selectedWord) {
  let matcher = null;
  if (selectedWord) {
    const escapeForRegex = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Same variants countTermInDoc accepts: the word itself, the word plus
    // "s", and the word minus a trailing "s". The last variant is only added
    // when the word really ends in "s"; always dropping the final character
    // made e.g. "and" also highlight "an" in cards that countTermInDoc
    // reports as not containing the word.
    const variants = [selectedWord, `${selectedWord}s`];
    if (selectedWord.length > 1 && selectedWord.toLowerCase().endsWith('s')) {
      variants.push(selectedWord.slice(0, -1));
    }
    matcher = new RegExp(`\\b(${variants.map(escapeForRegex).join('|')})\\b`, 'gi');
  }

  for (const docId of Object.keys(documents)) {
    const docElement = document.getElementById(`doc${docId}`);
    const { content } = documents[docId];

    // Reset the card first, then re-apply highlighting if requested.
    docElement.classList.remove('highlight');
    docElement.querySelector('.doc-content').innerHTML = matcher
      ? content.replace(matcher, '<span class="word-highlight">$1</span>')
      : content;

    if (matcher && countTermInDoc(selectedWord, docId) > 0) {
      docElement.classList.add('highlight');
    }
  }
}
// DTM and preprocessing functions
// Shared state for the document-term matrix (DTM) view.
// Sorted list of every distinct lowercase token across all documents; rebuilt by generateDTM().
let fullVocabulary = [];
// Term counts indexed as dtmData[docId][word]; rebuilt by generateDTM().
let dtmData = {};
// When false, renderDTM() shows only the first 10 vocabulary terms; flipped by toggleDTMView().
let showFullDTM = false;
/**
 * Build the document-term matrix state and reveal its UI.
 *
 * Collects the sorted set of lowercase tokens from every document into
 * `fullVocabulary`, fills `dtmData[docId][word]` with `countTermInDoc`
 * results, un-hides the DTM visualization, toggle button and next-step
 * panel, then renders the table.
 */
function generateDTM() {
  const docIds = Object.keys(documents);

  // Vocabulary: every distinct lowercase token, in default sort order.
  const seenTokens = new Set();
  for (const id of docIds) {
    for (const token of documents[id].words) {
      seenTokens.add(token.toLowerCase());
    }
  }
  fullVocabulary = [...seenTokens].sort();

  // Matrix: one row object per document, one count per vocabulary term.
  dtmData = {};
  for (const id of docIds) {
    dtmData[id] = {};
    for (const term of fullVocabulary) {
      dtmData[id][term] = countTermInDoc(term, id);
    }
  }

  // Reveal the DTM section and draw it.
  const reveal = (elementId, displayValue) => {
    document.getElementById(elementId).style.display = displayValue;
  };
  reveal('dtm-visualization', 'block');
  reveal('toggleDTMBtn', 'inline-flex');
  reveal('dtm-next-step', 'block');
  renderDTM();
}
/**
 * Draw the document-term matrix into the `dtm-table` element.
 *
 * Shows every vocabulary term when `showFullDTM` is true, otherwise only the
 * first 10 plus a trailing "..." column whose cells sum the hidden counts.
 * Count cells are colour-coded: 0, exactly 1, and everything else each get
 * their own background. Finishes by calling updateDTMStats().
 */
function renderDTM() {
  const BASE_CSS = 'padding: 12px; border: 1px solid var(--border);';
  const HEADER_CSS = `${BASE_CSS} background: var(--blue); color: white; font-weight: bold;`;
  const TERM_HEADER_CSS = `${HEADER_CSS} writing-mode: vertical-lr; text-orientation: mixed; min-width: 40px;`;
  const LABEL_CSS = `${BASE_CSS} background: var(--panel); font-weight: bold;`;
  const OVERFLOW_CSS = `${BASE_CSS} text-align: center; background: var(--panel); color: var(--muted); font-style: italic;`;

  const table = document.getElementById('dtm-table');
  const isTruncated = !showFullDTM && fullVocabulary.length > 10;
  const visibleTerms = showFullDTM ? fullVocabulary : fullVocabulary.slice(0, 10);
  const hiddenTerms = fullVocabulary.slice(10);

  // Start from an empty table.
  table.innerHTML = '';

  // Header row: corner label, one vertical cell per visible term, optional "...".
  const header = table.insertRow();
  const corner = header.insertCell();
  corner.innerHTML = '<strong>Document \\ Term</strong>';
  corner.style.cssText = HEADER_CSS;

  for (const term of visibleTerms) {
    const termCell = header.insertCell();
    termCell.innerHTML = `<strong>${term}</strong>`;
    termCell.style.cssText = TERM_HEADER_CSS;
  }

  if (isTruncated) {
    const ellipsisCell = header.insertCell();
    ellipsisCell.innerHTML = '<strong>...</strong>';
    ellipsisCell.style.cssText = HEADER_CSS;
  }

  // One data row per document.
  for (const docId in documents) {
    const dataRow = table.insertRow();
    const counts = dtmData[docId];

    const label = dataRow.insertCell();
    label.innerHTML = `<strong>Doc ${docId}: ${documents[docId].title}</strong>`;
    label.style.cssText = LABEL_CSS;

    for (const term of visibleTerms) {
      const count = counts[term];
      const countCell = dataRow.insertCell();
      countCell.textContent = count;

      let bgColor = '#c8e6c9';
      if (count === 0) {
        bgColor = '#ffebee';
      } else if (count === 1) {
        bgColor = '#e8f5e8';
      }
      const textColor = count === 0 ? '#c62828' : '#2e7d32';
      countCell.style.cssText = `${BASE_CSS} text-align: center; background: ${bgColor}; color: ${textColor}; font-weight: bold;`;
    }

    if (isTruncated) {
      // Total of the counts that fall outside the 10 visible columns.
      let hiddenCount = 0;
      for (const term of hiddenTerms) {
        hiddenCount = hiddenCount + counts[term];
      }
      const overflowCell = dataRow.insertCell();
      overflowCell.textContent = `+${hiddenCount} more`;
      overflowCell.style.cssText = OVERFLOW_CSS;
    }
  }

  updateDTMStats();
}
/**
 * Switch the DTM table between the truncated and the full vocabulary view,
 * update the toggle button's label to describe the opposite action, and
 * redraw the table.
 */
function toggleDTMView() {
  showFullDTM = !showFullDTM;

  let label;
  if (showFullDTM) {
    label = 'Show Fewer Terms';
  } else {
    label = 'Show All Terms';
  }
  document.getElementById('toggleDTMBtn').textContent = label;

  renderDTM();
}
function updateDTMStats() {
const totalCells = Object.keys(documents).length * fullVocabulary.length;
let nonZeroCells = 0;
for (let docId in dtmData) {
for (let word of fullVocabulary) {
if (dtmData[docId][word] > 0) nonZeroCells++;
}
}