Skip to content

Commit f8c702e

Browse files
committed
Initial commit
1 parent 5963e24 commit f8c702e

10 files changed

Lines changed: 508 additions & 0 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
.idea
2+
vendor
3+
composer.lock

README-screenshot.png

9.87 KB
Loading

README.md

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,109 @@
11
# php-diff
22
Text diff library for PHP
3+
4+
## Installation
5+
```cmd
6+
composer require yusitnikov/php-diff
7+
```
8+
9+
## Usage
10+
Create an instance of `LevenshteinDiffCalculator`. Arguments:
11+
1. `string $separatorRegex` - separator RegEx that will split the strings into units.
12+
By default, strings will be split into letters.
13+
2. `OperationCostCalculator $operationCostCalculator` - specify it to apply custom operation costs.
14+
By default, the cost for each operation (insert, delete, replace) will be 1.
15+
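3. `StringDiffCalculatorInterface $itemDiffCalculator` - optional diff calculator that is applied to a pair of units when one unit is replaced by another, producing a nested diff. By default, a replacement is reported as a deletion of the old unit followed by an insertion of the new one.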
16+
17+
### Example 1. Simple use-case - calculate distance and diff letter by letter.
18+
```php
19+
$s1 = 'Levenshtein';
20+
$s2 = 'Einstein';
21+
22+
$differ = new LevenshteinDiffCalculator();
23+
var_dump($differ->calcDistance($s1, $s2));
24+
var_dump($differ->calcDiff($s1, $s2));
25+
```
26+
27+
Output:
28+
```
29+
int(5)
30+
class Chameleon\PhpDiff\StringDiffResult#14 (2) {
31+
public $distance =>
32+
int(5)
33+
public $diff =>
34+
array(5) {
35+
[0] =>
36+
class Chameleon\PhpDiff\StringDiffOperation#12 (2) {
37+
public $operation =>
38+
string(6) "DELETE"
39+
public $content =>
40+
string(4) "Leve"
41+
}
42+
[1] =>
43+
class Chameleon\PhpDiff\StringDiffOperation#13 (2) {
44+
public $operation =>
45+
string(6) "INSERT"
46+
public $content =>
47+
string(2) "Ei"
48+
}
49+
[2] =>
50+
class Chameleon\PhpDiff\StringDiffOperation#7 (2) {
51+
public $operation =>
52+
string(5) "MATCH"
53+
public $content =>
54+
string(2) "ns"
55+
}
56+
[3] =>
57+
class Chameleon\PhpDiff\StringDiffOperation#5 (2) {
58+
public $operation =>
59+
string(6) "DELETE"
60+
public $content =>
61+
string(1) "h"
62+
}
63+
[4] =>
64+
class Chameleon\PhpDiff\StringDiffOperation#4 (2) {
65+
public $operation =>
66+
string(5) "MATCH"
67+
public $content =>
68+
string(4) "tein"
69+
}
70+
}
71+
}
72+
```
73+
74+
The corresponding diff:
75+
```diff
76+
- Leve
77+
+ Ei
78+
ns
79+
- h
80+
tein
81+
```
82+
83+
### Example 2. Complex differ - compare lines and words
84+
85+
```php
86+
$s1 = 'And now here is my secret,
87+
a very simple secret:
88+
it is only with the heart that one can see rightly,
89+
what is essential is invisible to the eye.
90+
91+
"The Little Prince", Antoine de Saint-Exupéry';
92+
93+
$s2 = '> So here is my secret,
94+
> its only with heart that somebody can see rightly,
95+
> what is essential is always invisible to the eye.
96+
> I can promise it to you.
97+
98+
- Le Petit Prince, Antoine Marie Jean-Baptiste Roger, comte de Saint-Exupéry';
99+
100+
$lineDiffer = new LevenshteinDiffCalculator(LevenshteinDiffCalculator::SPLIT_WORDS_REGEX);
101+
$textDiffer = new LevenshteinDiffCalculator(
102+
LevenshteinDiffCalculator::SPLIT_LINES_REGEX,
103+
(new OperationCostCalculator())->setReplaceDistanceCalculator($lineDiffer),
104+
$lineDiffer
105+
);
106+
$result = $textDiffer->calcDiff($s1, $s2);
107+
```
108+
109+
![Diff](README-screenshot.png)

composer.json

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"name": "yusitnikov/php-diff",
3+
"description": "Text diff library for PHP",
4+
"type": "library",
5+
"authors": [
6+
{
7+
"name": "Yura Sitnikov",
8+
"email": "y.sitnikoff@gmail.com"
9+
}
10+
],
11+
"license": "MIT",
12+
"require": {
13+
"php": "^7.0"
14+
},
15+
"minimum-stability": "stable",
16+
"autoload": {
17+
"psr-4": {
18+
"Chameleon\\PhpDiff\\": "src/"
19+
}
20+
}
21+
}

src/LevenshteinDiffCalculator.php

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
<?php
2+
3+
namespace Chameleon\PhpDiff;
4+
5+
// https://en.wikipedia.org/wiki/Levenshtein_distance
6+
/**
 * Calculates a weighted Levenshtein distance between two texts and rebuilds
 * the matching diff as a list of MATCH / INSERT / DELETE operations.
 *
 * The texts are split into units by a separator regex (separators are kept as
 * units of their own), the common prefix and suffix are stripped, and the
 * classic dynamic-programming matrix is computed over the remaining units.
 */
class LevenshteinDiffCalculator implements StringDiffCalculatorInterface
{
    /** Line separator: accepts both LF and CRLF line endings. */
    const SPLIT_LINES_REGEX = '\r?\n';
    /** Word separator: any single non-word character. */
    const SPLIT_WORDS_REGEX = '\W';
    /** Empty separator: the text is split between every character. */
    const SPLIT_CHARS_REGEX = '';

    /**
     * Cost matrix of the last _calcDistance() call that asked to keep it.
     * Cell [i][j] is the cost of turning the first i units of the trimmed
     * first input into the first j units of the trimmed second input.
     *
     * @var int[][]
     */
    private $matrix;

    /** @var OperationCostCalculator */
    private $operationCostCalculator;

    /** @var string */
    private $separatorRegex;

    /** @var StringDiffCalculatorInterface|null */
    private $itemDiffCalculator;

    /** @var int Number of leading units shared by both inputs of the last calculation. */
    private $startMatch;

    /** @var int Number of trailing units shared by both inputs (after the shared prefix was removed). */
    private $endMatch;

    /**
     * @param string $separatorRegex Regex body (no delimiters) matching the unit separators.
     * @param OperationCostCalculator|null $operationCostCalculator Custom operation costs;
     *        a default OperationCostCalculator is created when omitted.
     * @param StringDiffCalculatorInterface|null $itemDiffCalculator Calculator used to diff
     *        two units against each other when one replaces the other.
     */
    public function __construct(
        $separatorRegex = self::SPLIT_CHARS_REGEX,
        OperationCostCalculator $operationCostCalculator = null,
        StringDiffCalculatorInterface $itemDiffCalculator = null
    ) {
        $this->separatorRegex = $separatorRegex;
        $this->operationCostCalculator = $operationCostCalculator ?? new OperationCostCalculator();
        $this->itemDiffCalculator = $itemDiffCalculator;
    }

    /**
     * Splits a string into units; an input that is already an array is returned as is.
     *
     * Separators are captured, so they show up as units too; empty pieces are dropped.
     *
     * @param string[]|string $s
     * @return string[]
     * @throws \InvalidArgumentException when the string cannot be split
     *         (e.g. it is not valid UTF-8 or the separator regex is broken)
     */
    private function split($s)
    {
        if (is_array($s)) {
            return $s;
        }

        // -1 means "no limit"; passing null here is deprecated since PHP 8.1
        $units = preg_split(
            '/(' . $this->separatorRegex . ')/u',
            $s,
            -1,
            PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
        );

        // preg_split() reports failure with false; comparing two such results
        // would otherwise make any two unsplittable inputs look identical
        if ($units === false) {
            throw new \InvalidArgumentException(
                'Unable to split the input into units (PCRE error code ' . preg_last_error() . ')'
            );
        }

        return $units;
    }

    /**
     * Tells whether the whole string consists of one or more separators.
     *
     * @param string $s
     * @return bool always false when the separator regex is empty
     */
    private function isSeparator($s)
    {
        if ($this->separatorRegex === '') {
            return false;
        }

        return preg_match('/^(?:' . $this->separatorRegex . ')+$/u', $s) === 1;
    }

    /**
     * Calculates the distance and records the state calcDiff() needs afterwards:
     * $startMatch, $endMatch and (only when $keepMatrix is true) $matrix.
     *
     * @param string[]|string $s1
     * @param string[]|string $s2
     * @param bool $keepMatrix keep the full matrix for back-tracing; otherwise
     *        rows are released as soon as the next row has been computed
     * @return int
     */
    private function _calcDistance($s1, $s2, $keepMatrix)
    {
        // normalize the input
        $s1 = $this->split($s1);
        $s2 = $this->split($s2);

        // reset the matrix
        $this->matrix = [];

        $n1 = count($s1);
        $n2 = count($s2);

        // identical inputs: everything is one big leading match
        if ($s1 === $s2) {
            $this->startMatch = $n1;
            $this->endMatch = 0;
            return 0;
        }

        // strip the common prefix
        $prefix = 0;
        while ($prefix < $n1 && $prefix < $n2 && $s1[$prefix] === $s2[$prefix]) {
            $prefix++;
        }
        if ($prefix > 0) {
            $s1 = array_slice($s1, $prefix);
            $s2 = array_slice($s2, $prefix);
            $n1 -= $prefix;
            $n2 -= $prefix;
        }

        // strip the common suffix of what is left
        $suffix = 0;
        while ($suffix < $n1 && $suffix < $n2 && $s1[$n1 - 1 - $suffix] === $s2[$n2 - 1 - $suffix]) {
            $suffix++;
        }
        if ($suffix > 0) {
            $s1 = array_slice($s1, 0, -$suffix);
            $s2 = array_slice($s2, 0, -$suffix);
            $n1 -= $suffix;
            $n2 -= $suffix;
        }

        $this->startMatch = $prefix;
        $this->endMatch = $suffix;

        $costs = $this->operationCostCalculator;

        // first row: build the second input from nothing, insert by insert;
        // the insert cost of every column is remembered for the rows below
        $matrix = [[0]];
        $insertCosts = [];
        for ($i2 = 0; $i2 < $n2; $i2++) {
            $insertCosts[$i2] = $costs->getInsertCost($s2[$i2]);
            $matrix[0][$i2 + 1] = $matrix[0][$i2] + $insertCosts[$i2];
        }

        // remaining rows, one per unit of the first input
        for ($i1 = 0; $i1 < $n1; $i1++) {
            $unit1 = $s1[$i1];
            $deleteCost = $costs->getDeleteCost($unit1);
            $matrix[$i1 + 1][0] = $matrix[$i1][0] + $deleteCost;

            for ($i2 = 0; $i2 < $n2; $i2++) {
                $matrix[$i1 + 1][$i2 + 1] = min(
                    $matrix[$i1][$i2 + 1] + $deleteCost,
                    $matrix[$i1 + 1][$i2] + $insertCosts[$i2],
                    $matrix[$i1][$i2] + $costs->getReplaceCost($unit1, $s2[$i2])
                );
            }

            // only the previous row is needed to compute the next one
            if (!$keepMatrix) {
                unset($matrix[$i1]);
            }
        }

        $distance = $matrix[$n1][$n2];
        if ($keepMatrix) {
            $this->matrix = $matrix;
        }

        return $distance;
    }

    /**
     * Calculates the edit distance between two texts.
     *
     * @param string[]|string $s1
     * @param string[]|string $s2
     * @return int
     */
    public function calcDistance($s1, $s2)
    {
        return $this->_calcDistance($s1, $s2, false);
    }

    /**
     * Calculates the edit distance between two texts together with the diff.
     *
     * @param string[]|string $s1
     * @param string[]|string $s2
     * @return StringDiffResult
     */
    public function calcDiff($s1, $s2)
    {
        // normalize the input
        $s1 = $this->split($s1);
        $s2 = $this->split($s2);

        // calc the distance and the matrix
        $distance = $this->_calcDistance($s1, $s2, true);

        // take over the results and free the matrix memory held by the instance
        $matrix = $this->matrix;
        $this->matrix = [];
        $prefix = $this->startMatch;
        $suffix = $this->endMatch;

        // the shared prefix and suffix are plain matches around the real diff
        $startDiff = [];
        if ($prefix) {
            $startDiff[] = new StringDiffOperation(
                StringDiffOperation::MATCH,
                implode('', array_slice($s1, 0, $prefix))
            );
            $s1 = array_slice($s1, $prefix);
            $s2 = array_slice($s2, $prefix);
        }
        $endDiff = [];
        if ($suffix) {
            $endDiff[] = new StringDiffOperation(
                StringDiffOperation::MATCH,
                implode('', array_slice($s1, -$suffix))
            );
            $s1 = array_slice($s1, 0, -$suffix);
            $s2 = array_slice($s2, 0, -$suffix);
        }

        // walk the matrix back from the bottom-right corner; operations are
        // collected last-to-first and reversed once at the end, which avoids
        // re-indexing the whole list on every step
        $costs = $this->operationCostCalculator;
        $reversed = [];
        $i1 = count($s1);
        $i2 = count($s2);
        while ($i1 || $i2) {
            $c1 = $i1 ? $s1[$i1 - 1] : null;
            $c2 = $i2 ? $s2[$i2 - 1] : null;
            $insertCost = $i2
                ? $matrix[$i1][$i2 - 1] + $costs->getInsertCost($c2)
                : PHP_INT_MAX;
            $deleteCost = $i1
                ? $matrix[$i1 - 1][$i2] + $costs->getDeleteCost($c1)
                : PHP_INT_MAX;
            $replaceCost = ($i1 && $i2)
                ? $matrix[$i1 - 1][$i2 - 1] + $costs->getReplaceCost($c1, $c2)
                : PHP_INT_MAX;

            // check who is the minimal
            // the order is important: ties go to insert, then delete, then replace
            switch (min($insertCost, $deleteCost, $replaceCost)) {
                case $insertCost:
                    $reversed[] = new StringDiffOperation(StringDiffOperation::INSERT, $c2);
                    --$i2;
                    break;
                case $deleteCost:
                    $reversed[] = new StringDiffOperation(StringDiffOperation::DELETE, $c1);
                    --$i1;
                    break;
                case $replaceCost:
                    if ($c1 === $c2) {
                        $reversed[] = new StringDiffOperation(StringDiffOperation::MATCH, $c1);
                    } elseif ($this->itemDiffCalculator) {
                        // nested diff of the two units, appended back to front
                        $itemDiff = $this->itemDiffCalculator->calcDiff($c1, $c2)->diff;
                        foreach (array_reverse($itemDiff) as $operation) {
                            $reversed[] = $operation;
                        }
                    } else {
                        // reads as DELETE, INSERT once the list is reversed
                        $reversed[] = new StringDiffOperation(StringDiffOperation::INSERT, $c2);
                        $reversed[] = new StringDiffOperation(StringDiffOperation::DELETE, $c1);
                    }
                    --$i1;
                    --$i2;
                    break;
            }
        }

        // merge, re-order and unify the diffs
        $diff = array_merge($startDiff, array_reverse($reversed), $endDiff);
        $mergedDiff = [];
        /** @var StringDiffOperation $item */
        foreach ($diff as $item) {
            $mn = count($mergedDiff);

            // placeholders stand in for the items that do not exist yet
            $prevItem = $mergedDiff[$mn - 1] ?? new StringDiffOperation();
            $prevPrevItem = $mergedDiff[$mn - 2] ?? new StringDiffOperation();
            $prevPrevPrevItem = $mergedDiff[$mn - 3] ?? new StringDiffOperation();

            if ($item->operation === $prevItem->operation) {
                // same operation twice in a row: glue the contents together
                $prevItem->content .= $item->content;
            } elseif (
                $item->operation !== StringDiffOperation::MATCH
                && $prevItem->operation !== StringDiffOperation::MATCH
                && $item->operation === $prevPrevItem->operation
            ) {
                // alternating inserts and deletes: group each kind together
                $prevPrevItem->content .= $item->content;
            } elseif (
                $item->operation === StringDiffOperation::DELETE
                && $prevItem->operation === StringDiffOperation::MATCH
                && $this->isSeparator($prevItem->content)
                && $prevPrevItem->operation === StringDiffOperation::INSERT
                && $prevPrevPrevItem->operation === StringDiffOperation::DELETE
            ) {
                // DELETE, INSERT, matched separator, DELETE: fold the separator
                // into both sides so that the two deletions become one
                $prevPrevPrevItem->content .= $prevItem->content . $item->content;
                $prevPrevItem->content .= $prevItem->content;
                array_pop($mergedDiff);
            } else {
                $mergedDiff[] = $item;
            }
        }

        return new StringDiffResult($distance, $mergedDiff);
    }
}

0 commit comments

Comments
 (0)