Skip to content

Commit 53c9ff7

Browse files
jcrussellqkaiser
authored and committed
feat(handler): add support for MSI files
Extracts MSIs using 7z with custom CFBF header parsing to compute the full archive size. Works on both vanilla and padded MSI files. This could be migrated to a fully Python-based implementation in the future using: * https://github.com/nightlark/pymsi * https://github.com/decalage2/olefile As of v0.47, olefile does not handle padded MSIs properly so we re-implement CFBF header parsing and compute the archive size ourselves. Implement a complete Compound File FAT traversal: parse header, collect every FAT sector by following the DIFAT chain, read each FAT sector, and locate the highest allocated sector so we can compute MSI chunk size even for large archives exceeding the 109 header FAT entries.
1 parent 9e13e9d commit 53c9ff7

File tree

353 files changed

+1337
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

353 files changed

+1337
-1
lines changed

docs/handlers.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
| [`LZIP`](#lzip) | COMPRESSION | :octicons-check-16: |
4141
| [`LZMA`](#lzma) | COMPRESSION | :octicons-check-16: |
4242
| [`LZO`](#lzo) | COMPRESSION | :octicons-check-16: |
43+
| [`MSI`](#msi) | ARCHIVE | :octicons-alert-fill-12: |
4344
| [`MULTI-SEVENZIP`](#multi-sevenzip) | ARCHIVE | :octicons-check-16: |
4445
| [`NETGEAR CHK`](#netgear-chk) | ARCHIVE | :octicons-check-16: |
4546
| [`NETGEAR TRX V1`](#netgear-trx-v1) | ARCHIVE | :octicons-check-16: |
@@ -718,6 +719,28 @@
718719

719720
- [LZO File Format Documentation](http://www.lzop.org/){ target="_blank" }
720721
- [LZO Wikipedia](https://en.wikipedia.org/wiki/Lempel%E2%80%93Ziv%E2%80%93Oberhumer){ target="_blank" }
722+
## MSI
723+
724+
!!! warning "Partially supported"
725+
726+
=== "Description"
727+
728+
Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.
729+
730+
---
731+
732+
- **Handler type:** Archive
733+
- **Vendor:** Microsoft
734+
735+
=== "References"
736+
737+
- [MSI File Format Documentation](https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer){ target="_blank" }
738+
- [Compound File Binary Format](https://en.wikipedia.org/wiki/Compound_File_Binary_Format){ target="_blank" }
739+
740+
=== "Limitations"
741+
742+
- Limited to CFB based extraction, not full-on MSI extraction
743+
- Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer
721744
## multi-sevenzip
722745

723746
!!! success "Fully supported"

python/unblob/handlers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
cab,
77
cpio,
88
dmg,
9+
msi,
910
par2,
1011
partclone,
1112
rar,
@@ -89,6 +90,7 @@
8990
arc.ARCHandler,
9091
arj.ARJHandler,
9192
cab.CABHandler,
93+
msi.MsiHandler,
9294
tar.TarUstarHandler,
9395
tar.TarUnixHandler,
9496
cpio.PortableASCIIHandler,
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
import io
2+
import struct
3+
from typing import Optional
4+
5+
from structlog import get_logger
6+
7+
from unblob.extractors import Command
8+
9+
from ...file_utils import InvalidInputFormat
10+
from ...models import (
11+
File,
12+
HandlerDoc,
13+
HandlerType,
14+
HexString,
15+
Reference,
16+
StructHandler,
17+
ValidChunk,
18+
)
19+
20+
FREE_SECTOR = 0xFFFFFFFF
21+
END_OF_CHAIN = 0xFFFFFFFE
22+
HEADER_SIZE = 512
23+
24+
logger = get_logger()
25+
26+
27+
class MsiHandler(StructHandler):
28+
NAME = "msi"
29+
30+
PATTERNS = [HexString("D0 CF 11 E0 A1 B1 1A E1")]
31+
C_DEFINITIONS = r"""
32+
typedef struct cfbf_header
33+
{
34+
// [offset from start (bytes), length (bytes)]
35+
uint8 signature[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
36+
// 0x1a, 0xe1} for current version
37+
uint8 clsid[16]; // [08H,16] reserved must be zero (WriteClassStg/
38+
// GetClassFile uses root directory class id)
39+
uint16 minorVersion; // [18H,02] minor version of the format: 33 is
40+
// written by reference implementation
41+
uint16 dllVersion; // [1AH,02] major version of the dll/format: 3 for
42+
// 512-byte sectors, 4 for 4 KB sectors
43+
uint16 byteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
44+
uint16 sectorShift; // [1EH,02] size of sectors in power-of-two;
45+
// typically 9 indicating 512-byte sectors
46+
uint16 miniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
47+
// typically 6 indicating 64-byte mini-sectors
48+
uint16 reserved; // [22H,02] reserved, must be zero
49+
uint32 reserved1; // [24H,04] reserved, must be zero
50+
uint32 csectDir; // [28H,04] must be zero for 512-byte sectors,
51+
// number of SECTs in directory chain for 4 KB
52+
// sectors
53+
uint32 csectFat; // [2CH,04] number of SECTs in the FAT chain
54+
uint32 sectDirStart; // [30H,04] first SECT in the directory chain
55+
uint32 txSignature; // [34H,04] signature used for transactions; must
56+
// be zero. The reference implementation
57+
// does not support transactions
58+
uint32 miniSectorCutoff; // [38H,04] maximum size for a mini stream;
59+
// typically 4096 bytes
60+
uint32 sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
61+
uint32 csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
62+
uint32 sectDifStart; // [44H,04] first SECT in the DIFAT chain
63+
uint32 csectDif; // [48H,04] number of SECTs in the DIFAT chain
64+
uint32 sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
65+
} cfbf_header_t;
66+
"""
67+
HEADER_STRUCT = "cfbf_header_t"
68+
69+
EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")
70+
71+
DOC = HandlerDoc(
72+
name="MSI",
73+
description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
74+
handler_type=HandlerType.ARCHIVE,
75+
vendor="Microsoft",
76+
references=[
77+
Reference(
78+
title="MSI File Format Documentation",
79+
url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
80+
),
81+
Reference(
82+
title="Compound File Binary Format",
83+
url="https://en.wikipedia.org/wiki/Compound_File_Binary_Format",
84+
),
85+
],
86+
limitations=[
87+
"Limited to CFB based extraction, not full-on MSI extraction",
88+
"Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer",
89+
],
90+
)
91+
92+
def _read_sector(
    self, file: File, start_offset: int, sector_size: int, sector_id: int
) -> bytes:
    """Read one complete sector of the compound file.

    Args:
        file: File containing the compound file.
        start_offset: Offset of the compound file header inside ``file``.
        sector_size: Size of a sector in bytes.
        sector_id: Zero-based id of the sector to read.

    Returns:
        Exactly ``sector_size`` bytes read from the sector's position.

    Raises:
        InvalidInputFormat: If the sector starts past the end of ``file`` or
            fewer than ``sector_size`` bytes could be read.
    """
    # The header takes up one whole sector-sized slot in front of sector 0,
    # hence the "+ 1" when turning a sector id into a byte position.
    position = start_offset + (sector_id + 1) * sector_size
    if position > file.size():
        raise InvalidInputFormat("Invalid MSI file, sector offset too large")

    file.seek(position, io.SEEK_SET)
    data = file.read(sector_size)
    if len(data) == sector_size:
        return data

    raise InvalidInputFormat("Invalid MSI file, sector shorter than expected")
106+
107+
def _append_fat_sector(
    self, fat_sectors: list[int], sector_id: int, required_count: int
) -> bool:
    """Record ``sector_id`` as a FAT sector unless it is a free marker.

    Args:
        fat_sectors: List of FAT sector ids collected so far; appended to
            in place.
        sector_id: Candidate entry to record.
        required_count: Number of FAT sectors the caller wants to collect.

    Returns:
        ``True`` once ``fat_sectors`` holds at least ``required_count``
        entries after the append; ``False`` if the entry was skipped or
        more entries are still needed.
    """
    if sector_id != FREE_SECTOR:
        fat_sectors.append(sector_id)
        return len(fat_sectors) >= required_count

    # Free slots carry no sector id and never complete the list.
    return False
115+
116+
def _extend_fat_from_difat(
    self,
    file: File,
    header,
    start_offset: int,
    sector_size: int,
    entries_per_sector: int,
    fat_sectors: list[int],
) -> None:
    """Append FAT sector ids found by following the DIFAT sector chain.

    Every DIFAT sector is decoded as ``entries_per_sector`` little-endian
    32-bit values: all but the last are candidate FAT sector ids, and the
    last one is the id of the next DIFAT sector.

    Args:
        file: File containing the compound file.
        header: Parsed header; ``sectDifStart``, ``csectDif`` and
            ``csectFat`` are read from it.
        start_offset: Offset of the compound file header inside ``file``.
        sector_size: Size of a sector in bytes.
        entries_per_sector: Number of 32-bit entries in one sector.
        fat_sectors: List of FAT sector ids collected so far; extended in
            place.

    Raises:
        InvalidInputFormat: If a DIFAT sector cannot be read, or if the
            chain visits the same DIFAT sector twice.
    """
    difat_sector = header.sectDifStart
    # The chain pointers come from untrusted input. Without this guard a
    # sector that links back to an earlier one would be re-read up to
    # csectDif times and could pad fat_sectors with duplicate ids.
    visited: set[int] = set()

    for _ in range(header.csectDif):
        if difat_sector in (FREE_SECTOR, END_OF_CHAIN):
            break

        if difat_sector in visited:
            raise InvalidInputFormat("Invalid MSI file, DIFAT chain contains a loop")
        visited.add(difat_sector)

        raw_sector = self._read_sector(
            file, start_offset, sector_size, difat_sector
        )
        entries = struct.unpack(f"<{entries_per_sector}I", raw_sector)

        # Last entry links to the next DIFAT sector; the rest are FAT ids.
        difat_sector = entries[-1]
        for entry in entries[:-1]:
            if self._append_fat_sector(
                fat_sectors, entry, required_count=header.csectFat
            ):
                return
143+
def _collect_fat_sectors(
    self,
    file: File,
    header,
    start_offset: int,
    sector_size: int,
    entries_per_sector: int,
) -> list[int]:
    """Gather the ids of all FAT sectors announced by the header.

    The ids stored directly in the header (``sectFat``) are consumed first;
    if they do not provide ``csectFat`` entries, the DIFAT chain is followed
    to find the remaining ones.

    Args:
        file: File containing the compound file.
        header: Parsed header; ``sectFat`` and ``csectFat`` are read here.
        start_offset: Offset of the compound file header inside ``file``.
        sector_size: Size of a sector in bytes.
        entries_per_sector: Number of 32-bit entries in one sector.

    Returns:
        The FAT sector ids in the order they were encountered.

    Raises:
        InvalidInputFormat: If the number of ids found differs from
            ``csectFat``.
    """
    needed = header.csectFat
    sectors: list[int] = []

    for candidate in header.sectFat:
        complete = self._append_fat_sector(sectors, candidate, needed)
        if complete:
            return sectors

    if len(sectors) < needed:
        # The header slots were not enough; continue with the DIFAT chain.
        self._extend_fat_from_difat(
            file, header, start_offset, sector_size, entries_per_sector, sectors
        )

    if len(sectors) == needed:
        return sectors

    raise InvalidInputFormat("Invalid MSI file, incomplete FAT chain")
166+
167+
def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
    """Compute the extent of the compound file starting at ``start_offset``.

    The header is parsed, every FAT sector is read, and the highest FAT
    entry that is not marked free determines the last sector belonging to
    the file.

    Args:
        file: File being scanned.
        start_offset: Offset at which the signature was matched.

    Returns:
        A chunk spanning the header slot plus all sectors up to and
        including the highest non-free one.

    Raises:
        InvalidInputFormat: If the sector size is smaller than the header,
            the FAT is declared empty, or any required sector is invalid.
    """
    file.seek(start_offset, io.SEEK_SET)
    header = self.parse_header(file)

    sector_size = 1 << header.sectorShift
    entries_per_sector = sector_size // 4  # entries are 32-bit values

    if sector_size < HEADER_SIZE:
        raise InvalidInputFormat("Invalid MSI file, sector smaller than header")

    if header.csectFat == 0:
        raise InvalidInputFormat("Invalid MSI file, FAT chain is empty")

    fat_sectors = self._collect_fat_sectors(
        file, header, start_offset, sector_size, entries_per_sector
    )
    fat_layout = f"<{entries_per_sector}I"

    highest_sector = 0
    for fat_position, fat_sector_id in enumerate(fat_sectors):
        table = struct.unpack(
            fat_layout,
            self._read_sector(file, start_offset, sector_size, fat_sector_id),
        )

        # Scan from the end of this FAT sector for its last non-free entry.
        last_used = next(
            (
                slot
                for slot in range(len(table) - 1, -1, -1)
                if table[slot] != FREE_SECTOR
            ),
            None,
        )
        if last_used is None:
            continue

        # The n-th FAT sector describes sectors starting at
        # n * entries_per_sector.
        candidate = fat_position * entries_per_sector + last_used
        highest_sector = max(highest_sector, candidate)

    # One slot for the header plus sectors 0..highest_sector inclusive.
    chunk_length = (highest_sector + 2) * sector_size

    return ValidChunk(
        start_offset=start_offset,
        end_offset=start_offset + chunk_length,
    )

python/unblob/processing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@
5454
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
5555
DEFAULT_SKIP_MAGIC = (
5656
"BFLT",
57-
"Composite Document File V2 Document",
57+
# Disabled for MSI files
58+
# "Composite Document File V2 Document",
5859
"Erlang BEAM file",
5960
"GIF",
6061
"GNU message catalog",

tests/handlers/archive/test_msi.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
import struct
2+
3+
import pytest
4+
5+
from unblob.file_utils import File
6+
from unblob.handlers.archive.msi import (
7+
END_OF_CHAIN,
8+
FREE_SECTOR,
9+
MsiHandler,
10+
)
11+
12+
13+
def _build_msi_with_sector_shift(sector_shift: int) -> bytes:
    """Build a minimal compound file: a header slot followed by one FAT sector.

    The header occupies a full sector of ``2 ** sector_shift`` bytes and
    declares a single FAT sector (sector 0). That FAT sector marks its first
    entry as end-of-chain and every other entry as free.
    """
    sector_size = 1 << sector_shift
    entries_per_sector = sector_size // 4

    header = bytearray(sector_size)
    header[0:8] = bytes.fromhex("D0 CF 11 E0 A1 B1 1A E1")

    # Offsets and values taken from the CFBF header specification:
    # minorVersion, dllVersion, byteOrder, sectorShift, miniSectorShift,
    # reserved.
    major_version = 4 if sector_shift >= 12 else 3
    struct.pack_into(
        "<6H", header, 0x18, 0x0033, major_version, 0xFFFE, sector_shift, 6, 0
    )
    for offset, value in (
        (0x2C, 1),  # csectFat
        (0x38, 4096),  # miniSectorCutoff
        (0x3C, FREE_SECTOR),  # sectMiniFatStart
        (0x44, FREE_SECTOR),  # sectDifStart
    ):
        struct.pack_into("<I", header, offset, value)

    # sectFat[109]: only the first slot is used and points at sector 0.
    struct.pack_into("<109I", header, 0x4C, 0, *([FREE_SECTOR] * 108))

    fat_sector = struct.pack(
        f"<{entries_per_sector}I",
        END_OF_CHAIN,
        *([FREE_SECTOR] * (entries_per_sector - 1)),
    )

    return bytes(header) + fat_sector
49+
50+
51+
@pytest.mark.parametrize("sector_shift", [9, 12])
def test_calculate_chunk_respects_sector_size(sector_shift: int):
    """The chunk must cover exactly the generated file for each sector size."""
    leading_bytes = b"prefix"
    payload = _build_msi_with_sector_shift(sector_shift)
    blob = File.from_bytes(leading_bytes + payload)

    # The compound file starts right after the leading bytes.
    chunk = MsiHandler().calculate_chunk(blob, len(leading_bytes))

    assert chunk is not None
    assert chunk.start_offset == len(leading_bytes)
    assert chunk.end_offset == len(leading_bytes) + len(payload)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:dce9e456ace76b969fe0fe4d228bf096662c11d2376d99a9210f6364428a94c4
3+
size 1563648
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:da8f4120ab4ffacb19067a26f6a8b2695e00ec19bcc48ff694349c62df1b330b
3+
size 1563680
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:aa8e5036d973688f1e8622fbe9ab22e037346e0def0197bf5e7cdf37da4e223d
3+
size 3831808
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:12c87c542e1d4a39b47f176ffa5fd1691c98e5f9d502e6e46573962fb77c4510
3+
size 3831840
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16

0 commit comments

Comments
 (0)