-
Notifications
You must be signed in to change notification settings - Fork 6
Expand SQLite3 data validation #23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
0e3eeb6
e62cf3c
3a99279
64470dd
4572811
f4b6ffb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -125,13 +125,72 @@ def __init__(self, wal: WAL, offset: int): | |
def __repr__(self) -> str:
    """Return a short debug representation showing the frame's page number and page count."""
    fields = f"page_number={self.page_number} page_count={self.page_count}"
    return f"<Frame {fields}>"
|
|
||
| @property | ||
| def valid(self) -> bool: | ||
def is_valid(self, validate_checksums: bool = True) -> bool:
    """Return whether this frame is valid.

    The salt check is always performed. The checksum check is performed in addition
    only when ``validate_checksums`` is true, and only if the salt check passed.

    Args:
        validate_checksums: Whether to also verify the frame checksum.

    References:
        - https://sqlite.org/fileformat2.html#wal_file_format
    """
    if not validate_checksums:
        return self.is_valid_salt()

    # Short-circuit: the checksum is only computed when the salts already match.
    return self.is_valid_salt() and self.is_valid_checksum()
|
|
||
def is_valid_salt(self) -> bool:
    """Return whether both salt values of this frame equal those in the WAL header.

    References:
        - https://sqlite.org/fileformat2.html#wal_file_format
    """
    frame_header = self.header
    wal_header = self.wal.header

    # Both salts must agree; compare them as a pair.
    return (frame_header.salt1, frame_header.salt2) == (wal_header.salt1, wal_header.salt2)
|
|
||
def is_valid_checksum(self) -> bool:
    """Return whether the frame's checksum matches the calculated checksum.

    The checksum values in the final 8 bytes of the frame-header (checksum-1 and checksum-2)
    must exactly match the checksum computed consecutively over:

    1. the first 24 bytes of the WAL header
    2. the first 8 bytes of each frame header (up to and including this frame)
    3. the page data of each frame (up to and including this frame)

    Raises:
        EOFError: If a frame header or its page data cannot be read in full.

    References:
        - https://sqlite.org/fileformat2.html#wal_file_format
        - https://github.com/sqlite/sqlite/blob/master/src/wal.c#L995-L1047
    """
    # Loop invariants, looked up once.
    endian = self.wal.checksum_endian
    page_size = self.wal.header.page_size
    frame_header_size = len(c_sqlite3.wal_frame)
    frame_size = frame_header_size + page_size

    # Start the running checksum from the first 24 bytes of the *WAL* header
    # (not this frame's own header).
    seed = calculate_checksum(self.wal.header.dumps()[:24], endian=endian)

    # NOTE: every call re-reads and re-checksums all frames from the first one up to this frame,
    # so validating each frame of a WAL in turn is quadratic in the number of frames. Keeping the
    # running checksum (or the highest verified offset) on the WAL object would avoid this.
    offset = len(c_sqlite3.wal_header)
    while offset <= self.offset:
        # Read frame header
        self.fh.seek(offset)
        frame_hdr_bytes = self.fh.read(frame_header_size)
        if len(frame_hdr_bytes) < frame_header_size:
            raise EOFError("Incomplete frame header while calculating checksum")

        # Only the first 8 bytes of the frame header are covered by the checksum.
        seed = calculate_checksum(frame_hdr_bytes[:8], seed=seed, endian=endian)

        # Read and checksum page data, which directly follows the frame header
        page_data = self.fh.read(page_size)
        if len(page_data) < page_size:
            raise EOFError("Incomplete page data while calculating checksum")
        seed = calculate_checksum(page_data, seed=seed, endian=endian)

        offset += frame_size

    # Compare calculated checksum to stored checksum in this frame header
    return (seed[0], seed[1]) == (self.header.checksum1, self.header.checksum2)
|
|
||
| @property | ||
| def data(self) -> bytes: | ||
| self.fh.seek(self.offset + len(c_sqlite3.wal_frame)) | ||
|
|
@@ -187,8 +246,14 @@ class Commit(_FrameCollection): | |
| """ | ||
|
|
||
|
|
||
| def checksum(buf: bytes, endian: str = ">") -> tuple[int, int]: | ||
| s0 = s1 = 0 | ||
| def calculate_checksum(buf: bytes, seed: tuple[int, int] = (0, 0), endian: str = ">") -> tuple[int, int]: | ||
| """Calculate the checksum of a WAL header or frame. | ||
|
|
||
| References: | ||
| - https://sqlite.org/fileformat2.html#checksum_algorithm | ||
| """ | ||
|
|
||
| s0, s1 = seed | ||
| num_ints = len(buf) // 4 | ||
| arr = struct.unpack(f"{endian}{num_ints}I", buf) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.