Skip to content
This repository was archived by the owner on Aug 7, 2025. It is now read-only.

Commit 9807f60

Browse files
committed
Rename several more variables and add code comments
The goal of these variable renamings is twofold: to clarify the meaning of the values being used, and to name them consistently for readability. For example, the "scan" variable stored the position in the new file where a match is searched for, and "pos" stored the position in the old file where a match is found. These variables are now named "new_pos" and "old_pos" to reflect that they both store position information, referencing the new and old files, respectively. Also, declare the renamed variables closer to where they are used, since the function is already quite lengthy; we build the code with -std=gnu99, so this syntax is supported. Finally, add some (hopefully) helpful code comments to clarify the diff creation algorithm. The bspatch code in src/patch.c is better documented than the diff code, so I am starting with the diff code documentation. Signed-off-by: Patrick McCarty <patrick.mccarty@intel.com>
1 parent e27563e commit 9807f60

File tree

2 files changed

+142
-104
lines changed

2 files changed

+142
-104
lines changed

src/diff.c

Lines changed: 126 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -92,20 +92,20 @@ static int64_t matchlen(u_char *old, int64_t old_size, u_char *new,
9292
* Finds the longest matching array of bytes between the OLD and NEW file. The
9393
* old file is suffix-sorted; the suffix-sorted array is stored at I, and
9494
* indices to search between are indicated by ST (start) and EN (end). The
95-
* function does not return a value, but once a match is determined, POS is
95+
* function does not return a value, but once a match is determined, OLD_POS is
9696
* updated to the position of the match within OLD, and MAX_LEN is set to the
9797
* match length.
9898
*/
9999
static void search(int64_t *I, u_char *old, int64_t old_size,
100100
u_char *new, int64_t new_size, int64_t st, int64_t en,
101-
int64_t *pos, int64_t *max_len)
101+
int64_t *old_pos, int64_t *max_len)
102102
{
103103
int64_t x, y;
104104

105105
/* Initialize max_len for the binary search */
106106
if (st == 0 && en == old_size) {
107107
*max_len = matchlen(old, old_size, new, new_size);
108-
*pos = I[st];
108+
*old_pos = I[st];
109109
}
110110

111111
/* The binary search terminates here when "en" and "st" are adjacent
@@ -114,12 +114,12 @@ static void search(int64_t *I, u_char *old, int64_t old_size,
114114
x = matchlen(old + I[st], old_size - I[st], new, new_size);
115115
if (x > *max_len) {
116116
*max_len = x;
117-
*pos = I[st];
117+
*old_pos = I[st];
118118
}
119119
y = matchlen(old + I[en], old_size - I[en], new, new_size);
120120
if (y > *max_len) {
121121
*max_len = y;
122-
*pos = I[en];
122+
*old_pos = I[en];
123123
}
124124

125125
return;
@@ -134,14 +134,14 @@ static void search(int64_t *I, u_char *old, int64_t old_size,
134134
int64_t tmp = matchlen(oldoffset, length, new, length);
135135
if (tmp > *max_len) {
136136
*max_len = tmp;
137-
*pos = I[x];
137+
*old_pos = I[x];
138138
}
139139

140140
/* Determine how to continue the binary search */
141141
if (memcmp(oldoffset, new, length) < 0) {
142-
return search(I, old, old_size, new, new_size, x, en, pos, max_len);
142+
return search(I, old, old_size, new, new_size, x, en, old_pos, max_len);
143143
} else {
144-
return search(I, old, old_size, new, new_size, st, x, pos, max_len);
144+
return search(I, old, old_size, new, new_size, st, x, old_pos, max_len);
145145
}
146146
}
147147

@@ -388,14 +388,6 @@ int make_bsdiff_delta(char *old_filename, char *new_filename, char *delta_filena
388388
u_char *old_data, *new_data;
389389
int64_t old_size, new_size;
390390
int64_t *I, *V;
391-
int64_t scan;
392-
int64_t pos = 0;
393-
int64_t len;
394-
int64_t lastscan, lastpos, lastoffset;
395-
int64_t oldscore, scsc;
396-
int64_t s, Sf, lenf, Sb, lenb;
397-
int64_t overlap, Ss, lens;
398-
int64_t i;
399391
uint64_t cblen, dblen, eblen;
400392
u_char *cb, *db, *eb;
401393
struct stat new_stat;
@@ -607,103 +599,139 @@ int make_bsdiff_delta(char *old_filename, char *new_filename, char *delta_filena
607599
eblen = 0;
608600

609601
/* Compute the differences */
610-
scan = 0;
611-
len = 0;
612-
lastscan = 0;
613-
lastpos = 0;
614-
lastoffset = 0;
615-
while (scan < new_size) {
616-
oldscore = 0;
617-
618-
for (scsc = scan += len; scan < new_size; scan++) {
619-
search(I, old_data, old_size, new_data + scan, new_size - scan,
620-
0, old_size, &pos, &len);
621-
622-
for (; scsc < scan + len; scsc++) {
623-
if ((scsc + lastoffset < old_size) &&
624-
(old_data[scsc + lastoffset] == new_data[scsc])) {
625-
oldscore++;
602+
int64_t new_pos = 0;
603+
int64_t old_pos = 0;
604+
int64_t match_len = 0;
605+
int64_t last_new_pos = 0;
606+
int64_t last_old_pos = 0;
607+
int64_t last_offset = 0;
608+
while (new_pos < new_size) {
609+
// Find an exact match between old and new files, and require
610+
// that more than 8 of the matching bytes "mismatch" from the
611+
// previous exact match. A score (old_score) is used to track
612+
// how many bytes match starting from new_pos in new, and from
613+
// old_pos in the previous iteration.
614+
int64_t old_score = 0;
615+
int64_t new_peek;
616+
for (new_peek = new_pos += match_len; new_pos < new_size; new_pos++) {
617+
search(I, old_data, old_size, new_data + new_pos, new_size - new_pos,
618+
0, old_size, &old_pos, &match_len);
619+
620+
for (; new_peek < new_pos + match_len; new_peek++) {
621+
if ((new_peek + last_offset < old_size) &&
622+
(old_data[new_peek + last_offset] == new_data[new_peek])) {
623+
old_score++;
626624
}
627625
}
628626

629-
if (((len == oldscore) && (len != 0)) ||
630-
(len > oldscore + 8)) {
627+
if (((match_len == old_score) && (match_len != 0)) ||
628+
(match_len > old_score + 8)) {
631629
break;
632630
}
633631

634-
if ((scan + lastoffset < old_size) &&
635-
(old_data[scan + lastoffset] == new_data[scan])) {
636-
oldscore--;
632+
// Before beginning the next loop iteration, decrement
633+
// old_score if needed, since new_pos will be
634+
// incremented.
635+
if ((new_pos + last_offset < old_size) &&
636+
(old_data[new_pos + last_offset] == new_data[new_pos])) {
637+
old_score--;
637638
}
638639
}
639640

640-
if ((len != oldscore) || (scan == new_size)) {
641-
s = 0;
642-
Sf = 0;
643-
lenf = 0;
644-
for (i = 0;
645-
(lastscan + i < scan) && (lastpos + i < old_size);) {
646-
if (old_data[lastpos + i] == new_data[lastscan + i]) {
647-
s++;
641+
if ((match_len != old_score) || (new_pos == new_size)) {
642+
int64_t bytes = 0, max = 0;
643+
// Compute the length of a fuzzy match starting from
644+
// the beginning of the fuzzy match recorded at the end
645+
// of the previous iteration (i.e. len_fuzzybackward
646+
// less than the previous match positions). At least
647+
// half of the bytes match between old and new. This
648+
// fuzzy match will be used to construct a diff string
649+
// in the diff block.
650+
int64_t len_fuzzyforward = 0;
651+
for (int64_t i = 0;
652+
(last_new_pos + i < new_pos) && (last_old_pos + i < old_size);) {
653+
if (old_data[last_old_pos + i] == new_data[last_new_pos + i]) {
654+
bytes++;
648655
}
649656
i++;
650-
if (s * 2 - i > Sf * 2 - lenf) {
651-
Sf = s;
652-
lenf = i;
657+
if (bytes * 2 - i > max * 2 - len_fuzzyforward) {
658+
max = bytes;
659+
len_fuzzyforward = i;
653660
}
654661
}
655662

656-
lenb = 0;
657-
if (scan < new_size) {
658-
s = 0;
659-
Sb = 0;
660-
for (i = 1;
661-
(scan >= lastscan + i) && (pos >= i);
663+
// Compute the length of a fuzzy match ending at the
664+
// current positions in old and new files (old_pos and
665+
// new_pos). At least half of the bytes match between
666+
// old and new. This fuzzy match will be used for the
667+
// next iteration.
668+
int64_t len_fuzzybackward = 0;
669+
if (new_pos < new_size) {
670+
bytes = 0;
671+
max = 0;
672+
for (int64_t i = 1;
673+
(new_pos >= last_new_pos + i) && (old_pos >= i);
662674
i++) {
663-
if (old_data[pos - i] == new_data[scan - i]) {
664-
s++;
675+
if (old_data[old_pos - i] == new_data[new_pos - i]) {
676+
bytes++;
665677
}
666-
if (s * 2 - i > Sb * 2 - lenb) {
667-
Sb = s;
668-
lenb = i;
678+
if (bytes * 2 - i > max * 2 - len_fuzzybackward) {
679+
max = bytes;
680+
len_fuzzybackward = i;
669681
}
670682
}
671683
}
672684

673-
if (lastscan + lenf > scan - lenb) {
674-
overlap = (lastscan + lenf) - (scan - lenb);
675-
s = 0;
676-
Ss = 0;
677-
lens = 0;
678-
for (i = 0; i < overlap; i++) {
679-
if (new_data[lastscan + lenf - overlap + i] ==
680-
old_data[lastpos + lenf - overlap + i]) {
681-
s++;
685+
// If there is an overlap between len_fuzzyforward and
686+
// len_fuzzybackward in the new file, that overlap must
687+
// be eliminated.
688+
if (last_new_pos + len_fuzzyforward > new_pos - len_fuzzybackward) {
689+
bytes = 0;
690+
max = 0;
691+
int64_t overlap = (last_new_pos + len_fuzzyforward) - (new_pos - len_fuzzybackward);
692+
int64_t len_fuzzyshift = 0;
693+
// Scan the overlap area for differences
694+
// between old and new. If any mismatching
695+
// bytes are found, extend len_fuzzyforward to
696+
// cover those bytes, because we want them
697+
// included in the diff block.
698+
for (int64_t i = 0; i < overlap; i++) {
699+
if (new_data[last_new_pos + len_fuzzyforward - overlap + i] ==
700+
old_data[last_old_pos + len_fuzzyforward - overlap + i]) {
701+
bytes++;
682702
}
683-
if (new_data[scan - lenb + i] ==
684-
old_data[pos - lenb + i]) {
685-
s--;
703+
if (new_data[new_pos - len_fuzzybackward + i] ==
704+
old_data[old_pos - len_fuzzybackward + i]) {
705+
bytes--;
686706
}
687-
if (s > Ss) {
688-
Ss = s;
689-
lens = i + 1;
707+
if (bytes > max) {
708+
max = bytes;
709+
len_fuzzyshift = i + 1;
690710
}
691711
}
692712

693-
lenf += lens - overlap;
694-
lenb -= lens;
713+
len_fuzzyforward += len_fuzzyshift - overlap;
714+
len_fuzzybackward -= len_fuzzyshift;
695715
}
696716

697-
for (i = 0; i < lenf; i++) {
717+
// Set the diff string in the diff block. For each byte
718+
// in the fuzzy forward region, the byte from old is
719+
// subtracted from new. When applying the delta (with
720+
// bspatch) this operation is reversed, by performing
721+
// additions.
722+
for (int64_t i = 0; i < len_fuzzyforward; i++) {
698723
db[dblen + i] =
699-
new_data[lastscan + i] - old_data[lastpos + i];
724+
new_data[last_new_pos + i] - old_data[last_old_pos + i];
700725
}
701-
for (i = 0; i < (scan - lenb) - (lastscan + lenf); i++) {
702-
eb[eblen + i] = new_data[lastscan + lenf + i];
726+
// Set the extra string in the extra block. The
727+
// contents are the bytes in new file between the fuzzy
728+
// forward and fuzzy backward regions.
729+
for (int64_t i = 0; i < (new_pos - len_fuzzybackward) - (last_new_pos + len_fuzzyforward); i++) {
730+
eb[eblen + i] = new_data[last_new_pos + len_fuzzyforward + i];
703731
}
704732

705-
dblen += lenf;
706-
eblen += (scan - lenb) - (lastscan + lenf);
733+
dblen += len_fuzzyforward;
734+
eblen += (new_pos - len_fuzzybackward) - (last_new_pos + len_fuzzyforward);
707735

708736
/* checking for control block overflow...
709737
* See regression test #15 for an example */
@@ -717,18 +745,28 @@ int make_bsdiff_delta(char *old_filename, char *new_filename, char *delta_filena
717745
return -1;
718746
}
719747

720-
offtout(lenf, cb + cblen);
748+
// Set three values in the control block:
749+
// 1. ADD instruction (value: length of the diff
750+
// string). It uses the offset of the third control
751+
// block value from the previous iteration.
752+
// 2. INSERT instruction (value: length of the extra
753+
// string)
754+
// 3. offset in old file for the next ADD instruction
755+
offtout(len_fuzzyforward, cb + cblen);
721756
cblen += 8;
722757

723-
offtout((scan - lenb) - (lastscan + lenf), cb + cblen);
758+
offtout((new_pos - len_fuzzybackward) - (last_new_pos + len_fuzzyforward), cb + cblen);
724759
cblen += 8;
725760

726-
offtout((pos - lenb) - (lastpos + lenf), cb + cblen);
761+
offtout((old_pos - len_fuzzybackward) - (last_old_pos + len_fuzzyforward), cb + cblen);
727762
cblen += 8;
728763

729-
lastscan = scan - lenb;
730-
lastpos = pos - lenb;
731-
lastoffset = pos - scan;
764+
// Save old/new file positions to the beginning of the
765+
// fuzzy backward region, since the next fuzzy forward
766+
// region will be calculated from that point.
767+
last_new_pos = new_pos - len_fuzzybackward;
768+
last_old_pos = old_pos - len_fuzzybackward;
769+
last_offset = old_pos - new_pos;
732770
}
733771
}
734772
free(I);

0 commit comments

Comments
 (0)