Skip to content

Commit e810cad

Browse files
Add utility to compute check if strings are within certain edit distance
For now only basic levenshtein distance (delete,insert,modify). This is probably good enough for the intended use case (suggesting typo fixes on the command line).
1 parent 2bd790d commit e810cad

File tree

6 files changed

+426
-0
lines changed

6 files changed

+426
-0
lines changed

src/util/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ SRC = allocate_objects.cpp \
1111
cout_message.cpp \
1212
dstring.cpp \
1313
endianness_map.cpp \
14+
edit_distance.cpp \
1415
expr.cpp \
1516
expr_initializer.cpp \
1617
expr_util.cpp \

src/util/edit_distance.cpp

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/// \file
2+
/// \author Diffblue Ltd.
3+
///
4+
/// Provides a way to compute edit distance between two strings
5+
6+
#include "edit_distance.h"
7+
8+
levenshtein_automatont::levenshtein_automatont(
9+
const std::string &string,
10+
std::size_t allowed_errors)
11+
{
12+
const std::size_t layer_offset = string.size() + 1;
13+
for(std::size_t i = 0; i <= allowed_errors; ++i)
14+
{
15+
final_states.push_back(string.size() + layer_offset * i);
16+
}
17+
for(std::size_t string_index = 0; string_index < string.size();
18+
++string_index)
19+
{
20+
for(std::size_t error_layer = 0; error_layer <= allowed_errors;
21+
++error_layer)
22+
{
23+
// position string_index matches
24+
nfa.add_transition(
25+
error_layer * layer_offset + string_index,
26+
string[string_index],
27+
error_layer * layer_offset + string_index + 1);
28+
if(error_layer < allowed_errors)
29+
{
30+
// insertion, swap or deletion
31+
nfa.add_arbitrary_transition(
32+
error_layer * layer_offset + string_index,
33+
(error_layer + 1) * layer_offset + string_index);
34+
nfa.add_epsilon_transition(
35+
error_layer * layer_offset + string_index,
36+
(error_layer + 1) * layer_offset + string_index + 1);
37+
nfa.add_arbitrary_transition(
38+
error_layer * layer_offset + string_index,
39+
(error_layer + 1) * layer_offset + string_index + 1);
40+
}
41+
}
42+
}
43+
for(std::size_t error_layer = 0; error_layer < allowed_errors; ++error_layer)
44+
{
45+
// arbitrary transitions between error layers
46+
nfa.add_arbitrary_transition(
47+
error_layer * layer_offset + string.size(),
48+
(error_layer + 1) * layer_offset + string.size());
49+
}
50+
}
51+
52+
bool levenshtein_automatont::matches(const std::string &string) const
53+
{
54+
return get_edit_distance(string).has_value();
55+
}
56+
57+
optionalt<std::size_t>
58+
levenshtein_automatont::get_edit_distance(const std::string &string) const
59+
{
60+
auto current = nfa.initial_state(0);
61+
for(const auto c : string)
62+
{
63+
current = nfa.next_state(current, c);
64+
}
65+
for(std::size_t distance = 0; distance < final_states.size(); ++distance)
66+
{
67+
if(current.contains(final_states[distance]))
68+
{
69+
return distance;
70+
}
71+
}
72+
return nullopt;
73+
}

src/util/edit_distance.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/// \file
2+
/// \author Diffblue Ltd.
3+
///
4+
/// Loosely based on this blog post:
5+
/// http://blog.notdot.net/2010/07/Damn-Cool-Algorithms-Levenshtein-Automata
6+
/// Provides a way to compute edit distance between two strings
7+
///
8+
/// No conversion to DFA or other optimisations are done here because for our
9+
/// use case (i.e. suggestions for errors in command line specifications) this
10+
/// is fast enough without them.
11+
12+
#ifndef CPROVER_UTIL_EDIT_DISTANCE_H
13+
#define CPROVER_UTIL_EDIT_DISTANCE_H
14+
15+
#include "nfa.h"
16+
17+
#include <cstddef>
18+
#include <string>
19+
20+
#include <util/optional.h>
21+
22+
/// Simple automaton that can detect whether a string can be transformed into
23+
/// another with a limited number of deletions, insertions or substitutions.
24+
/// Not a very fast implementation, but should be good enough for small strings.
25+
struct levenshtein_automatont
26+
{
27+
private:
28+
nfat<char> nfa;
29+
using state_labelt = nfat<char>::state_labelt;
30+
std::vector<state_labelt> final_states;
31+
32+
public:
33+
levenshtein_automatont(
34+
const std::string &string,
35+
std::size_t allowed_errors = 2);
36+
37+
bool matches(const std::string &string) const;
38+
optionalt<std::size_t> get_edit_distance(const std::string &string) const;
39+
40+
void dump_automaton_dot_to(std::ostream &out)
41+
{
42+
nfa.dump_automaton_dot_to(out);
43+
};
44+
};
45+
46+
#endif // CPROVER_UTIL_EDIT_DISTANCE_H

src/util/nfa.h

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
/// \file
2+
/// \author Diffblue Ltd.
3+
///
4+
/// A simple NFA implementation
5+
6+
#ifndef CPROVER_UTIL_NFA_H
7+
#define CPROVER_UTIL_NFA_H
8+
9+
#include <algorithm>
10+
#include <cstddef>
11+
#include <fstream>
12+
#include <iosfwd>
13+
#include <unordered_map>
14+
#include <unordered_set>
15+
#include <vector>
16+
17+
/// Very simple NFA implementation
18+
/// Not super performant, but should be good enough for our purposes
19+
template <typename T>
20+
struct nfat
21+
{
22+
using state_labelt = std::size_t;
23+
/// A state is a set of possibly active transitions.
24+
/// The automaton itself is stateless, it just contains
25+
/// the state transitions.
26+
struct statet
27+
{
28+
private:
29+
std::unordered_set<state_labelt> possible_states;
30+
31+
public:
32+
friend struct nfat;
33+
34+
bool contains(state_labelt state_label) const
35+
{
36+
return possible_states.count(state_label) != 0;
37+
}
38+
};
39+
40+
nfat() = default;
41+
42+
/// Add a transition from "from" to "to" exactly when "when" is consumed
43+
void add_transition(state_labelt from, T when, state_labelt to)
44+
{
45+
resize_if_necessary(from, to);
46+
transitions[from].when[when].insert(to);
47+
}
48+
49+
/// Add a transition from "from" to "to" when any input is consumed
50+
/// This is handled a special case so not all inputs need to be enumerated
51+
/// for arbitrary transitions
52+
void add_arbitrary_transition(state_labelt from, state_labelt to)
53+
{
54+
resize_if_necessary(from, to);
55+
transitions[from].arbitrary.insert(to);
56+
}
57+
58+
/// Add a transition from "from" to "to" that doesn’t consume input
59+
void add_epsilon_transition(state_labelt from, state_labelt to)
60+
{
61+
resize_if_necessary(from, to);
62+
transitions[from].epsilon.insert(to);
63+
}
64+
65+
statet initial_state(state_labelt initial) const
66+
{
67+
statet result{};
68+
result.possible_states.insert(initial);
69+
follow_epsilon_transitions(result);
70+
return result;
71+
}
72+
73+
statet next_state(const statet &current, const T &input) const
74+
{
75+
statet new_state{};
76+
for(const auto label : current.possible_states)
77+
{
78+
const auto &transition = transitions[label];
79+
const auto it = transition.when.find(input);
80+
if(it != transition.when.end())
81+
{
82+
for(const auto result_state : it->second)
83+
{
84+
new_state.possible_states.insert(result_state);
85+
}
86+
}
87+
for(const auto result_state : transition.arbitrary)
88+
{
89+
new_state.possible_states.insert(result_state);
90+
}
91+
}
92+
follow_epsilon_transitions(new_state);
93+
return new_state;
94+
}
95+
96+
/// Write the automaton structure to out in graphviz dot format.
97+
/// Meant for debugging.
98+
void dump_automaton_dot_to(std::ostream &out) const
99+
{
100+
out << "digraph {\n";
101+
for(state_labelt from = 0; from < transitions.size(); ++from)
102+
{
103+
for(const auto to : transitions[from].arbitrary)
104+
{
105+
out << 'S' << from << " -> S" << to << "[label=\"*\"]\n";
106+
}
107+
for(const auto to : transitions[from].epsilon)
108+
{
109+
out << 'S' << from << " -> S" << to << u8"[label=\"ε\"]\n";
110+
}
111+
for(const auto &pair : transitions[from].when)
112+
{
113+
const auto &in = pair.first;
114+
const auto &tos = pair.second;
115+
for(const auto to : tos)
116+
{
117+
out << 'S' << from << " -> S" << to << "[label=\"(" << in << ")\"]\n";
118+
}
119+
}
120+
}
121+
out << "}\n";
122+
}
123+
124+
private:
125+
void resize_if_necessary(state_labelt from, state_labelt to)
126+
{
127+
const auto min_size = std::max(from, to) + 1;
128+
if(transitions.size() < min_size)
129+
{
130+
transitions.resize(min_size);
131+
}
132+
}
133+
134+
void follow_epsilon_transitions(statet &state) const
135+
{
136+
std::vector<state_labelt> new_states{};
137+
do
138+
{
139+
new_states.clear();
140+
for(const auto from : state.possible_states)
141+
{
142+
for(const auto to : transitions[from].epsilon)
143+
{
144+
if(!state.contains(to))
145+
{
146+
new_states.push_back(to);
147+
}
148+
}
149+
}
150+
std::copy(
151+
new_states.begin(),
152+
new_states.end(),
153+
std::inserter(state.possible_states, state.possible_states.end()));
154+
} while(!new_states.empty());
155+
}
156+
157+
struct transitiont
158+
{
159+
std::unordered_set<state_labelt> epsilon;
160+
std::unordered_set<state_labelt> arbitrary;
161+
std::unordered_map<T, std::unordered_set<state_labelt>> when;
162+
};
163+
164+
std::vector<transitiont> transitions;
165+
};
166+
167+
#endif // CPROVER_UTIL_NFA_H

unit/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ SRC += analyses/ai/ai.cpp \
7373
util/allocate_objects.cpp \
7474
util/cmdline.cpp \
7575
util/dense_integer_map.cpp \
76+
util/edit_distance.cpp \
7677
util/expr_cast/expr_cast.cpp \
7778
util/expr.cpp \
7879
util/expr_iterator.cpp \

0 commit comments

Comments
 (0)