-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCFG.cpp
More file actions
2782 lines (2404 loc) · 124 KB
/
CFG.cpp
File metadata and controls
2782 lines (2404 loc) · 124 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "CFG.h"
#include "Utilities/Unicode.h"
#include <vector>
#include <set>
#include <queue>
#include <sstream>
#include <iostream>
#include <cctype>
#include <map>
#include <functional>
#include <algorithm>
#include <random>
#include <unordered_map>
#include <unordered_set>
using namespace std;
namespace CFG {
namespace {
/* Generic fixed-point iteration routine. Many algorithms on CFGs make use of
* fixed-point iteration on the nonterminals, and this factors that logic out into
* a separate routine.
*/
template <typename Result>
map<char32_t, Result> fixedPointIterate(const CFG& cfg,
function<bool (const Production&, map<char32_t, Result>&)> update) {
/* Initialize the map. */
map<char32_t, Result> result;
for (char32_t nonterminal: cfg.nonterminals) {
result[nonterminal] = Result();
}
/* Repeat until convergence. */
bool changed;
do {
changed = false;
for (const auto& production: cfg.productions) {
changed |= update(production, result);
}
} while (changed);
return result;
}
}
bool operator== (const Symbol& lhs, const Symbol& rhs) {
return lhs.ch == rhs.ch && lhs.type == rhs.type;
}
ostream& operator<< (ostream& out, const Symbol& symbol) {
ostringstream builder;
if (symbol.type == Symbol::Type::TERMINAL) builder << toUTF8(symbol.ch);
else builder << "<" << toUTF8(symbol.ch) << ">";
return out << builder.str();
}
ostream& operator<< (ostream& out, const Production& prod) {
ostringstream builder;
builder << "<" << toUTF8(prod.nonterminal) << "> -> ";
if (prod.replacement.empty()) {
builder << "ε";
} else {
for (auto symbol: prod.replacement) {
builder << symbol;
}
}
return out << builder.str();
}
ostream& operator<< (ostream& out, const Derivation& d) {
ostringstream builder;
for (size_t i = 0; i < d.size(); i++) {
builder << "(" << d[i].first << " @" << d[i].second << ")";
if (i + 1 != d.size()) builder << ", ";
}
return out << builder.str();
}
bool operator< (const Symbol& lhs, const Symbol& rhs) {
if (lhs.type != rhs.type) return int(lhs.type) < int(rhs.type);
return lhs.ch < rhs.ch;
}
bool operator< (const Production& lhs, const Production& rhs) {
if (lhs.nonterminal != rhs.nonterminal) return lhs.nonterminal < rhs.nonterminal;
return lhs.replacement < rhs.replacement;
}
namespace {
/* Nullablity information is a map from nullable nonterminals to the production
* they should use as their first step toward null.
*/
using Nulls = std::map<char32_t, const Production*>;
Nulls nullablesOf(const CFG& cfg) {
Nulls result;
bool changed;
do {
changed = false;
/* Loop over all productions and see if any of them give us a new way to null. */
for (const auto& p: cfg.productions) {
/* Already nullable? Skip. */
if (result.count(p.nonterminal)) continue;
/* See if all terms in the production are nullable. */
if (all_of(p.replacement.begin(), p.replacement.end(), [&](Symbol s) {
return s.type == Symbol::Type::NONTERMINAL && result.count(s.ch);
})) {
result[p.nonterminal] = &p;
changed = true;
}
}
} while (changed);
return result;
}
string toString(const set<char32_t>& s) {
ostringstream result;
result << "{ ";
for (char32_t ch: s) {
result << toUTF8(ch) << " ";
}
result << "}";
return result.str();
}
}
ostream& operator<< (ostream& out, const CFG& cfg) {
out << "Start: " << toUTF8(cfg.startSymbol) << endl;
out << "Alphabet: " << toString(cfg.alphabet) << endl;
out << "Nonterminals: " << toString(cfg.nonterminals) << endl;
out << "Productions:" << endl;
for (const auto& p: cfg.productions) {
out << " " << p << endl;
}
return out;
}
/**************************************************************************
**************************************************************************
*** Earley Parser Implementation ***
**************************************************************************
**************************************************************************
This is the logic for the matcher and deriver. It uses Earley's algorithm,
a simple general context-free grammar parser that runs in worst-case cubic
time as a function of the string length. The core Earley algorithm is
fairly accessible; much of the complexity here is due to the logistics of
handling epsilon productions.
The hardest part here is the deriver, which reconstructs a derivation for
a string in the language. The comments at that section explain how that's
done. It's essentially a double backtracking search and was way harder to
write than I thought it was going to be. :-)
*************************************************************************/
namespace {
struct EarleyItem {
const Production* production; // Use pointers for identity semantics when comparing
size_t dotPos; // Where the dot is
size_t itemPos; // Where this item starts
};
/* State of the Earley parse. */
struct EarleyState {
/* Earley items per slot. */
vector<set<EarleyItem>> items;
/* Nullable nonterminals; used for quick lookup during prediction. */
Nulls nullable;
/* Productions by nonterminal; used for quick lookup during prediction. */
map<char32_t, vector<const Production*>> grammar;
};
/* For debugging. */
ostream& operator<< (ostream& out, const EarleyItem& item) {
out << "<" << toUTF8(item.production->nonterminal) << "> -> ";
for (size_t i = 0; i < item.production->replacement.size(); i++) {
if (item.dotPos == i) cout << ".";
out << toUTF8(item.production->replacement[i].ch);
}
if (item.dotPos == item.production->replacement.size()) {
out << ".";
}
return out << " @" << item.itemPos;
}
/* So we can stash them in a std::set. */
bool operator< (const EarleyItem& lhs, const EarleyItem& rhs) {
if (lhs.production != rhs.production) {
/* Legal; will always be drawn from the same vector. */
return lhs.production < rhs.production;
} else if (lhs.dotPos != rhs.dotPos) {
return lhs.dotPos < rhs.dotPos;
} else {
return lhs.itemPos < rhs.itemPos;
}
}
/* cctype replacements for char32_t's. */
bool isASCII(char32_t ch) {
return 0 <= static_cast<int>(ch) && ch <= 127;
}
bool isSpace(char32_t ch) {
return isASCII(ch) && isspace(static_cast<int>(ch));
}
/* Given a string in UTF-8 formatting, breaks it apart into individual characters,
* represented as strings with UTF-8 formatting.
*/
vector<char32_t> utf8Decode(const string& input, const Languages::Alphabet& alphabet) {
vector<char32_t> result;
for (char32_t ch: utf8Reader(input)) {
if (isSpace(ch)) continue;
if (!alphabet.count(ch)) throw runtime_error("Invalid character: " + toUTF8(ch));
result.push_back(ch);
}
return result;
}
/* Utility for Earley: is item's dot at end? */
bool dotAtEnd(const EarleyItem& item) {
return item.dotPos == item.production->replacement.size();
}
/* Utility for Earley: item after the dot. */
Symbol afterDot(const EarleyItem& item) {
if (dotAtEnd(item)) abort(); // Logic error!
return item.production->replacement[item.dotPos];
}
/* Utility for Earley: Shifts the dot over. This produces a new item rather than
* editing the existing one.
*/
EarleyItem advanceDot(const EarleyItem& item) {
if (dotAtEnd(item)) abort(); // Logic error!
auto result = item;
result.dotPos++;
return result;
}
EarleyItem retreatDot(const EarleyItem& item) {
if (item.dotPos == 0) abort(); // Logic error!
auto result = item;
result.dotPos--;
return result;
}
const bool kParserVerbose = false;
/* Adds an item to an Earley parser. */
bool addItem(EarleyState& state, size_t index, const EarleyItem& item) {
if (index > state.items.size() || item.itemPos > index) abort(); // Logic error!
return state.items[index].insert(item).second;
}
/* "Scan" step of the Earley algorithm. */
void scan(EarleyState& state, size_t index, char32_t ch) {
if (kParserVerbose) cout << "SCAN " << index << endl;
/* See if anything here needs to have the dot shifted. */
for (const auto& item: state.items[index]) {
/* Item after the dot matches this character. */
if (kParserVerbose) cout << "Considering " << item << endl;
if (!dotAtEnd(item) && afterDot(item) == Symbol{Symbol::Type::TERMINAL, ch}) {
if (kParserVerbose) cout << " Will scan." << endl;
/* Shift the dot over. */
auto next = advanceDot(item);
if (kParserVerbose) cout << " Result: " << next << endl;
addItem(state, index + 1, next);
}
}
if (kParserVerbose) cout << endl;
}
/* "Complete" step of the Earley algorithm. Returns whether anything
* changed in the course of running this process.
*/
bool complete(EarleyState& state, size_t index) {
bool result = false;
if (kParserVerbose) cout << "COMPLETE " << index << endl;
/* Create a worklist of items to process. */
queue<EarleyItem> worklist;
for (const auto& item: state.items[index]) {
if (kParserVerbose) cout << " Considering " << item << endl;
if (dotAtEnd(item)) { // At end
if (kParserVerbose) cout << " Adding to worklist." << endl;
worklist.push(item);
}
}
/* Keep processing items until none are left. */
while (!worklist.empty()) {
auto item = worklist.front();
worklist.pop();
if (kParserVerbose) cout << " Processing " << item << endl;
/* Track back to where this item came from and push all dots before
* this nonterminal forward.
*/
for (const auto& pred: state.items[item.itemPos]) {
if (kParserVerbose) cout << " Possible predecessor: " << pred << endl;
/* Dot not at end, and is before this nonterminal. */
if (!dotAtEnd(pred) && afterDot(pred) == Symbol{Symbol::Type::NONTERMINAL, item.production->nonterminal}) {
if (kParserVerbose) cout << " Will complete." << endl;
/* Shift the dot forward. */
auto next = advanceDot(pred);
if (kParserVerbose) cout << " Completes to " << next << endl;
/* If we haven't already seen this, and if it's at the end, we need to
* process it too.
*/
if (addItem(state, index, next)) {
result = true;
if (kParserVerbose) cout << " This is new - do we need to process?" << endl;
if (dotAtEnd(next)) {
if (kParserVerbose) cout << " Yes!" << endl;
worklist.push(next);
}
else if (kParserVerbose) cout << " No" << endl;
}
}
}
}
if (kParserVerbose) cout << endl;
return result;
}
/* "Predict" step of Earley parser. */
bool predict(EarleyState& state, size_t index) {
bool result = false;
if (kParserVerbose) cout << "PREDICT " << index << endl;
/* Worklist; we may discover many things. */
queue<EarleyItem> worklist;
/* Seed queue with all items with dots before nonterminals. */
for (const auto& item: state.items[index]) {
if (kParserVerbose) cout << " Considering " << item << endl;
if (!dotAtEnd(item) && afterDot(item).type == Symbol::Type::NONTERMINAL) {
if (kParserVerbose) cout << " Adding to worklist." << endl;
worklist.push(item);
}
}
/* Keep processing until done. */
while (!worklist.empty()) {
auto item = worklist.front();
worklist.pop();
if (dotAtEnd(item) || afterDot(item).type == Symbol::Type::TERMINAL) abort(); // Logic error!
if (kParserVerbose) cout << " Processing " << item << endl;
/* Find all productions we can use here. */
char32_t nonterminal = afterDot(item).ch;
for (const auto* prod: state.grammar[nonterminal]) {
if (kParserVerbose) cout << " Considering production " << prod << endl;
/* If we haven't yet seen this, we may need to put it into the queue
* too for nonterminals at the front.
*/
if (kParserVerbose) cout << " Match!" << endl;
EarleyItem predicted = { prod, 0, index };
if (kParserVerbose) cout << " Produced " << predicted << endl;
if (addItem(state, index, predicted)) {
result = true;
if (kParserVerbose) cout << " This is new. Process it?" << endl;
/* Nonterminal at the front? Add it in. */
if (!dotAtEnd(predicted) && afterDot(predicted).type == Symbol::Type::NONTERMINAL) {
if (kParserVerbose) cout << " Yes!" << endl;
worklist.push(predicted);
}
else if (kParserVerbose) cout << " No" << endl;
}
}
}
if (kParserVerbose) cout << endl;
return result;
}
void completeAndPredict(EarleyState& state, size_t index) {
/* Run predictor and completer until convergence. */
bool completeUpdate;
bool predictUpdate;
do {
completeUpdate = complete(state, index);
predictUpdate = predict(state, index);
} while (completeUpdate || predictUpdate);
}
/* For debugging. */
void printItems(const EarleyState& state, size_t index) {
cout << "Items at index " << index << ": " << endl;
for (const auto& item: state.items.at(index)) {
cout << " " << item << endl;
}
}
/* Earley parsing algorithm. We run in a loop of scan/complete/predict, starting
* at predict, and see if we find the proper Earley item in the last column.
*
* We use a slightly different indexing system than the traditional Earley position.
* Position 0 is just before the first character, and position i+1 is just after the
* ith character.
*
* TODO: This can be cleaned up / optimized as follows. Iterate over the item sets
* as usual, but use the sets themselves as the worklist! Use the idea from "Practical
* Earley Parsing" of running predict (shifting dots over nullables), then complete,
* then scan. This avoids the tight loop of complete/predict.
*/
EarleyState earley(char32_t start,
const Nulls& nullable,
const map<char32_t, vector<const Production*>>& grammar,
const vector<char32_t>& input) {
EarleyState state;
state.nullable = nullable;
state.grammar = grammar;
state.items.resize(input.size() + 1);
/* Seed with the initial Earley items. */
if (kParserVerbose) cout << "SEEDING" << endl;
for (const auto* prod: state.grammar[start]) {
if (kParserVerbose) cout << "Including this production." << endl;
addItem(state, 0, { prod, 0, 0 });
}
/* Run the Earley parser. */
for (size_t i = 0; i <= input.size(); i++) {
if (kParserVerbose) printItems(state, i);
completeAndPredict(state, i);
if (i != input.size()) scan(state, i, input[i]);
}
if (kParserVerbose) cout << endl;
return state;
}
/* Given the result of an Earley parse, returns some final item from that
* parse (or null if one doesn't exist).
*/
const EarleyItem* acceptingItem(const EarleyState& state, char32_t start) {
/* See if we match. */
for (const auto& item: state.items.back()) {
if (kParserVerbose) cout << "Final item: " << item << endl;
if (item.production->nonterminal == start && // Start symbol
item.itemPos == 0 && // occurs at the beginning
item.dotPos == item.production->replacement.size()) { // is complete
if (kParserVerbose) cout << " Success!" << endl;
return &item;
}
}
return nullptr;
}
/* Acceptance works by running Earley and seeing if we have any matching items. */
bool accepts(char32_t start,
const Nulls& nullable,
const map<char32_t, vector<const Production*>>& grammar,
const vector<char32_t>& input) {
return acceptingItem(earley(start, nullable, grammar, input), start) != nullptr;
}
/* Given a nonterminal and a position, creates a sequence of Earley items corresponding
* to that nonterminal getting replaced by epsilon.
*/
vector<EarleyItem> nullingSequenceFor(const EarleyState& state,
char32_t nonterminal,
size_t position) {
if (!state.nullable.count(nonterminal)) abort(); // Logic error!
/* Seed things with an initial Earley item. */
vector<EarleyItem> result = { { state.nullable.at(nonterminal), 0, position } };
/* Keep appending. */
while (!dotAtEnd(result[0])) {
auto next = nullingSequenceFor(state, afterDot(result[0]).ch, position);
result.insert(result.end(), next.begin(), next.end());
result[0] = advanceDot(result[0]);
}
return result;
}
const bool kDeriverVerbose = false;
/* Given the result of an Earley parse, returns a derivation of the input string.
*
* The basic idea behind this algorithm is not that complicated. We begin with
* some Earley item we want to come up with a derivation for, say,
*
* S -> alpha X . beta @n #k
*
* Here, the "#k" refers to the fact that the item is in slot k, meaning that
* this Earley item spans the range [n, k).
*
* The basic idea is the following. If X is a terminal, then we know the item
*
* S -> alpha .X beta @n #k-1
*
* exists, so we recursively get a derivation for it. Otherwise, X is a nonterminal,
* so we search for an item of the form
*
* S -> alpha .X beta @n #i
*
* where there's also an item
*
* X -> gamma. @i #k
*
* and get derivations for these items. Putting those together gives us our derivation.
*
* The problem is that grammars can have loops in them, and we need to be extra vigilant
* about this. In particular, while S -> alpha .X beta @n #i can't loop (we enforce that
* i < k), the rule X -> gamma. @i #k *can* cause a loop if i = n. This means that, as
* we're running this algorithm, we need to make sure that we don't get caught in a loop.
*
* The specific way we'll do this is the following. We'll name the two items here the
* "left" and "right" items. The "left" item is the (earlier) item that never can trigger
* a loop, since we monotonically move to the left. The "right" item can trigger a loop.
* To ensure that it doesn't, we'll keep track of all the nonterminals that have appeared
* on the left side of the right production *at the specific position spanned by the current
* Earley item*. We then ignore all productions that would lead to loops, doing so via a
* backtracking search.
*
* This can also manifest as failures in the LEFT side as well. Those failures occur if we
* picked an item for the left that works, but requires a self-loop.
*
* More generally, this algorithm can fail if the item we're working with could potentially
* derive the string in question, but only via at least one self-loop.
*
* There's one more issue we need to handle, and that's nullable nonterminals. Specifically,
* we may find that we need a pair
*
* S -> alpha . X beta @n #i
* X -> gamma. @i #k
*
* where the left item is at index k. But that can easily trigger loops in the left item, which
* we want to avoid. To address this, we'll use a shortcut. In the event that we have
*
* S -> alpha X . beta @n #k
*
* and X is nullable, then the item
*
* S -> alpha . X beta @n #k
*
* may also exist in this spot. If it does, we'll do a backtracking search to see whether this
* leads us to our goal.
*/
pair<vector<EarleyItem>, bool>
derivationOfRec(const EarleyState& state,
const EarleyItem& item, size_t position,
const set<char32_t>& used, // Nonterminals that were used in the current range
size_t indent = 2) {
if (kDeriverVerbose) {
cout << string(indent, ' ') << "Deriving " << item << " #" << position << " with these restrictions:" << endl;
cout << string(indent, ' ') << "{ ";
for (char32_t ch: used) {
cout << toUTF8(ch) << " ";
}
cout << "}" << endl;
}
/* Base case: If the dot is at the front of the production, nothing more
* is required.
*/
if (item.dotPos == 0) return {{}, true};
auto prev = retreatDot(item);
/* Recursive case 1: Dot before a terminal means we shift it back. */
if (afterDot(prev).type == Symbol::Type::TERMINAL) {
if (kDeriverVerbose) cout << string(indent, ' ') << "Terminal shift." << endl;
/* Position has changed. Reset the 'used' set because we didn't hit any loops
* at this interval.
*/
return derivationOfRec(state, prev, position - 1, { }, indent + 4);
}
/* Recursive case 2: Dot before a nullable nonterminal. If we can, shift the
* dot back one position.
*/
auto nonterminal = afterDot(prev).ch;
if (state.nullable.count(nonterminal) && state.items[position].count(prev)) {
if (kDeriverVerbose) cout << string(indent, ' ') << "Nullable shift." << endl;
/* Position has NOT changed; do not reset 'used.' */
auto result = derivationOfRec(state, prev, position, used, indent + 4);
if (result.second) {
auto nulling = nullingSequenceFor(state, nonterminal, position);
result.first.insert(result.first.end(), nulling.begin(), nulling.end());
return result;
}
if (kDeriverVerbose) cout << string(indent, ' ') << "Nullable shift failed." << endl;
}
/* Recursive case 3: Dot before a nonterminal. Given that we have
*
* S -> alpha X . beta @n #k
*
* we want to search for a pair
*
* S -> alpha . X beta @n #i
* X -> gamma. @i #k
*
* We need to be careful here not to allow for loops; there are several
* cases where X -> gamma. @i #k could result in a loop here!
*
* We search all item slots *before* this one because epsilons are handled
* in the above step.
*/
for (size_t i = 0; i < position; i++) {
/* See if the retreated dot item is here. */
if (kDeriverVerbose) cout << string(indent, ' ') << "Index " << i << endl;
if (state.items[i].count(prev)) {
if (kDeriverVerbose) cout << string(indent, ' ') << "Found " << prev << " in slot " << i << endl;
if (kDeriverVerbose) cout << string(indent, ' ') << "Searching for production <" << toUTF8(nonterminal) << "> -> alpha. @" << i << " #" << position << endl;
/* See if we can find a completed item for the nonterminal in the
* current slot.
*/
for (const auto& next: state.items[position]) {
if (!dotAtEnd(next) || next.production->nonterminal != nonterminal || next.itemPos != i) {
if (kDeriverVerbose) cout << string(indent, ' ') << " Wrong position; needed " << i << endl;
continue;
}
if (kDeriverVerbose) cout << string(indent, ' ') << "Considering " << next << " #" << position << endl;
if (next.itemPos == item.itemPos && used.count(next.production->nonterminal)) {
if (kDeriverVerbose) cout << string(indent, ' ') << " Already used this nonterminal here." << endl;
continue;
}
if (kDeriverVerbose) cout << string(indent, ' ') << "Splitting into " << prev << " #" << i << " and " << next << " #" << position << endl;
/* We now have our pair. See whether we can derive the right. */
pair<vector<EarleyItem>, bool> rhs;
/* If this item has the same range as the current one, blacklist the
* nonterminal involved.
*/
if (next.itemPos == item.itemPos) {
auto nextUsed = used;
nextUsed.insert(next.production->nonterminal);
rhs = derivationOfRec(state, next, position, nextUsed, indent + 4);
}
/* Otherwise, we've made progress, so we can reset the used set. */
else {
rhs = derivationOfRec(state, next, position, {}, indent + 4);
}
/* If this failed, move on. */
if (!rhs.second) {
if (kDeriverVerbose) cout << string(indent, ' ') << "Loop detected; fail." << endl;
continue;
}
/* Right side worked, which is great! Now get the left side. */
auto lhs = derivationOfRec(state, prev, i, {}, indent + 4);
/* This could also fail if we picked an LHS item that requires us
* to use a self-loop. So if that happens, skip it and move on.
*/
if (!lhs.second) {
if (kDeriverVerbose) cout << string(indent, ' ') << "Loop detected; fail." << endl;
continue;
}
vector<EarleyItem> result;
result.insert(result.end(), lhs.first.begin(), lhs.first.end());
result.push_back(next);
result.insert(result.end(), rhs.first.begin(), rhs.first.end());
return { result, true };
}
}
}
if (kDeriverVerbose) cout << string(indent, ' ') << "FAILURE; backtracking..." << endl;
return { {}, false };
}
Derivation derivationOf(char32_t start,
const Nulls& nullable,
const map<char32_t, vector<const Production*>>& grammar,
const vector<char32_t>& input) {
auto state = earley(start, nullable, grammar, input);
/* Try all possible derivations from the end and see if any of them work. */
for (const auto& item: state.items.back()) {
if (kDeriverVerbose) cout << "Inspecting item " << item << endl;
if (dotAtEnd(item) && item.itemPos == 0 && item.production->nonterminal == start) {
/* See if we can find a derivation here. */
auto derivation = derivationOfRec(state, item, input.size(), { item.production->nonterminal });
if (!derivation.second) continue;
/* Fencepost issue; this first one isn't added in. */
derivation.first.insert(derivation.first.begin(), item);
/* Translate from a list of Earley items into what we actually want. */
Derivation result;
for (const auto& item: derivation.first) {
result.push_back({ *item.production, item.itemPos });
}
return result;
}
}
return {};
}
/* Given a CFG, converts it to a format usable in Earley parsing. */
map<char32_t, vector<const Production*>> toEarleyGrammar(const CFG& cfg) {
map<char32_t, vector<const Production*>> result;
for (const auto& prod: cfg.productions) {
result[prod.nonterminal].push_back(&prod);
}
return result;
}
Matcher earleyMatcherFor(const CFG& cfg) {
/* Clone the grammar locally so that internal pointers stay valid. */
auto grammarRef = make_shared<CFG>(cfg);
auto nullable = nullablesOf(*grammarRef);
auto grammar = toEarleyGrammar(*grammarRef);
return [=](const string& input) {
return accepts(grammarRef->startSymbol, nullable, grammar, utf8Decode(input, grammarRef->alphabet));
};
}
}
Deriver deriverFor(const CFG& cfg) {
/* Clone the grammar locally so that internal pointers stay valid. */
auto grammarRef = make_shared<CFG>(cfg);
auto nullable = nullablesOf(*grammarRef);
auto grammar = toEarleyGrammar(*grammarRef);
return [=](const string& input) {
return derivationOf(grammarRef->startSymbol, nullable, grammar, utf8Decode(input, grammarRef->alphabet));
};
}
/**************************************************************************
**************************************************************************
*** McKenzie Generator Implementation ***
**************************************************************************
**************************************************************************
The logic below is for the algorithm to generate random strings from a CFG
using McKenzie's algorithm.
Routines to sample strings from a CFG. I am indebted to former CS103 TA and
all-around superstar Kevin Gibbons for sourcing the paper "Generating
Strings at Random from a Context-Free Grammar" by Bruce McKenzie, for
spotting typos in the paper and correcting them, and for working out that
it's necessary to eliminate epsilon productions and self-loops in the
grammar before first running this algorithm.
*************************************************************************/
namespace {
/* Type representing the 5D (!) Tail table.
*
* Nonterminal x Length x Production x Start -> [ numbers ]
*/
using TailTable = map<char32_t, map<size_t, map<const Production*, map<size_t, vector<size_t>>>>>;
/* Representation of a grammar, optimized for fast lookups of productions for a nonterminal. */
using McKenzieGrammar = map<char32_t, vector<Production>>;
struct McKenzieGenerator {
McKenzieGenerator(const CFG& cfg);
pair<bool, string> operator()(size_t) const;
/* Cleaned grammar; used for generation. It's stored this way because
* we need the ability to index productions by number.
*/
McKenzieGrammar grammar;
/* Start symbol. */
char32_t start;
/* Stored tail table (see below). This is mutable because it can be populated on demand. */
mutable TailTable tail;
/* Grammar can generate empty string. Is that the case? */
bool hasEpsilon;
};
namespace {
/* The random number generator. */
mt19937 theGenerator;
/* Given a production, produces all nonempty productions that can be formed by
* taking subsets of the nullable nonterminals.
*/
void generateSubsetsOf(const Production& p,
const Nulls& nullable,
set<Production>& result) {
std::function<void(Production&, size_t)> rec =
[&](Production& soFar, size_t index) {
/* If at end, add unless we're empty. */
if (index == p.replacement.size()) {
if (!soFar.replacement.empty()) result.insert(soFar);
return;
}
/* Include next character. */
soFar.replacement.push_back(p.replacement[index]);
rec(soFar, index+1);
soFar.replacement.pop_back();
/* If next is nullable, skip it. */
if (p.replacement[index].type == Symbol::Type::NONTERMINAL &&
nullable.count(p.replacement[index].ch)) {
rec(soFar, index + 1);
}
};
Production builder;
builder.nonterminal = p.nonterminal;
rec(builder, 0);
}
/* Given a CFG, returns the epsilon normal form of that CFG. This is formed by
* replacing all epsilon productions with new productions by "dropping out"
* nonterminals that are nullable in all (nonempty) combinations.
*
* TODO: This is misleading because it doesn't add epsilon back in at the end
* if the start symbol is nullable. Be careful!
*/
CFG epsilonNormalFormOf(const CFG& cfg, const Nulls& nullable) {
set<Production> newProds;
for (const auto& prod: cfg.productions) {
generateSubsetsOf(prod, nullable, newProds);
}
auto result = cfg;
result.productions.assign(newProds.begin(), newProds.end());
return result;
}
CFG epsilonNormalFormOf(const CFG& cfg) {
return epsilonNormalFormOf(cfg, nullablesOf(cfg));
}
/* Given a CFG in epsilon normal form, returns a new CFG such that if there any
* unit productions, they form a DAG.
*
* A unit production is one of the form A -> B. The basic idea is to replace all
* productions of the form A -> B by finding productions of the form B -> alpha
* and then directly writing A -> alpha, replacing the A -> B production.
*
* The challenge here is that we can have chains of nonterminals, such as
* A -> B -> C, where A needs to absorb items out of C.
*
* Making this more complicated, we also have to worry about chains of
* the form A -> B -> A, in which we get a cycle from a nonterminal back to
* itself. In that case, A needs to take on B's productions, and B needs to take
* on A's productions as well. (This includes a special case of A -> A, nonterminals
* that immediately produce themselves.)
*
* The most general version of this problem goes like this: we have a graph where
* each node is a nonterminal and there's an edge from A to B if A -> B is a unit
* production. We need to duplicate productions while also respecting cycles.
*
* Our approach is to use a strongly-connected-components algorithm to find SCC's of
* nonterminals that can produce one another. We'll then process them in reverse
* topological order (deepest first, topmost last). We'll then pick one representative
* of each of the nonterminals in an SCC and combine all the productions together
* under that representative.
*
* The resulting graph may still have unit productions in it, but those units will
* form a DAG, which is all we need.
*/
using Graph = map<char32_t, set<char32_t>>;
/* Returns whether something is a unit production. */
bool isNonterminalUnit(const Production& p) {
return p.replacement.size() == 1 && p.replacement[0].type == Symbol::Type::NONTERMINAL;
}
bool isTerminalUnit(const Production& p) {
return p.replacement.size() == 1 && p.replacement[0].type == Symbol::Type::TERMINAL;
}
/* Constructs the graph where A -> B is an edge if A -> B is a production. */
Graph unitGraphOf(const CFG& cfg) {
Graph result;
/* Ensure a node for each nonterminal. */
for (char32_t nonterminal: cfg.nonterminals) {
(void) result[nonterminal];
}
for (const auto& p: cfg.productions) {
if (isNonterminalUnit(p)) {
/* Insert both nodes and an edge one way. */
result[p.nonterminal].insert(p.replacement[0].ch);
}
}
return result;
}
/* Runs a DFS, recording the finishing times in the output vector. */
void dfs(char32_t nonterminal, const Graph& graph, vector<char32_t>& order, set<char32_t>& visited) {
if (!visited.insert(nonterminal).second) return;
for (char32_t next: graph.at(nonterminal)) {
dfs(next, graph, order, visited);
}
order.push_back(nonterminal);
}
/* Given a graph, reverses that graph. */
Graph reverseOf(const Graph& input) {
Graph result;
for (const auto& src: input) {
/* Clone nodes. */
(void) result[src.first];
for (char32_t dst: src.second) {
result[dst].insert(src.first);
}
}
return result;
}
/* Returns the SCCs of a given graph in reverse topological order. This uses Kosaraju's
* SCC algorithm.
*/
vector<vector<char32_t>> sccsOf(const Graph& graph) {
/* Work with the reverse of the graph so that we find SCCs in reverse topological order. */
Graph rev = reverseOf(graph);
/* Run the inital DFS. */
vector<char32_t> order;
set<char32_t> visited;
for (const auto& entry: rev) {
dfs(entry.first, rev, order, visited);
}
/* Run the reverse DFS's. */
vector<vector<char32_t>> result;
reverse(order.begin(), order.end());
visited.clear();
for (char32_t nonterminal: order) {
vector<char32_t> scc;
dfs(nonterminal, graph, scc, visited);
/* SCC can be empty if the node was already visited. */
if (!scc.empty()) result.push_back(std::move(scc));
}
return result;
}
CFG unitNormalForm(const CFG& cfg) {
/* Map from each nonterminal to its representative. */
map<char32_t, char32_t> reps;
/* Loop over SCCs, combining productions together. */
for (const auto& scc: sccsOf(unitGraphOf(cfg))) {
/* The first becomes the representative for all of these. */
auto rep = scc.front();
for (char32_t nonterminal: scc) {
reps[nonterminal] = rep;
}
}
/* Rebuild the productions list. */
set<Production> productions;
for (const auto& p: cfg.productions) {
/* Replace all nonterminals with their representatives. */
auto newP = p;
newP.nonterminal = reps.at(newP.nonterminal);
for (auto& s: newP.replacement) {
if (s.type == Symbol::Type::NONTERMINAL) {
s.ch = reps.at(s.ch);
}
}
/* If this is a unit, include it if it's not a loop. */
if (!isNonterminalUnit(newP) || newP.replacement[0].ch != newP.nonterminal) {