Skip to content

Commit a1b6c90

Browse files
jcrussell authored and qkaiser committed
feat(handler): add support for MSI files
Extracts MSIs using 7z with custom CFBF header parsing to compute the full archive size. Works on both vanilla and padded MSI files.

This could be migrated to a fully Python-based implementation in the future using:
* https://github.com/nightlark/pymsi
* https://github.com/decalage2/olefile

As of v0.47, olefile does not handle padded MSIs properly, so we re-implement CFBF header parsing and compute the archive size ourselves.
1 parent 9e13e9d commit a1b6c90

File tree

352 files changed

+1201
-1
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

352 files changed

+1201
-1
lines changed

docs/handlers.md

Lines changed: 23 additions & 0 deletions

python/unblob/handlers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
cab,
77
cpio,
88
dmg,
9+
msi,
910
par2,
1011
partclone,
1112
rar,
@@ -89,6 +90,7 @@
8990
arc.ARCHandler,
9091
arj.ARJHandler,
9192
cab.CABHandler,
93+
msi.MsiHandler,
9294
tar.TarUstarHandler,
9395
tar.TarUnixHandler,
9496
cpio.PortableASCIIHandler,
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
import io
2+
import struct
3+
from typing import Optional
4+
5+
from structlog import get_logger
6+
7+
from unblob.extractors import Command
8+
9+
from ...file_utils import InvalidInputFormat
10+
from ...models import (
11+
File,
12+
HandlerDoc,
13+
HandlerType,
14+
HexString,
15+
Reference,
16+
StructHandler,
17+
ValidChunk,
18+
)
19+
20+
logger = get_logger()
21+
22+
23+
class MsiHandler(StructHandler):
    """Carve MSI installers, which are CFBF (Compound File Binary) containers.

    The chunk size is derived from the highest sector that the file
    allocation table (FAT) marks as used, so data appended after the
    container (padding) is left out of the chunk. Extraction itself is
    delegated to ``7z``.
    """

    NAME = "msi"

    PATTERNS = [HexString("D0 CF 11 E0 A1 B1 1A E1")]
    C_DEFINITIONS = r"""
        typedef struct cfbf_header
        {
            // [offset from start (bytes), length (bytes)]
            uint8 signature[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
                                // 0x1a, 0xe1} for current version
            uint8 clsid[16]; // [08H,16] reserved must be zero (WriteClassStg/
                             // GetClassFile uses root directory class id)
            uint16 minorVersion; // [18H,02] minor version of the format: 33 is
                                 // written by reference implementation
            uint16 dllVersion; // [1AH,02] major version of the dll/format: 3 for
                               // 512-byte sectors, 4 for 4 KB sectors
            uint16 byteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
            uint16 sectorShift; // [1EH,02] size of sectors in power-of-two;
                                // typically 9 indicating 512-byte sectors
            uint16 miniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
                                    // typically 6 indicating 64-byte mini-sectors
            uint16 reserved; // [22H,02] reserved, must be zero
            uint32 reserved1; // [24H,04] reserved, must be zero
            uint32 csectDir; // [28H,04] must be zero for 512-byte sectors,
                             // number of SECTs in directory chain for 4 KB
                             // sectors
            uint32 csectFat; // [2CH,04] number of SECTs in the FAT chain
            uint32 sectDirStart; // [30H,04] first SECT in the directory chain
            uint32 txSignature; // [34H,04] signature used for transactions; must
                                // be zero. The reference implementation
                                // does not support transactions
            uint32 miniSectorCutoff; // [38H,04] maximum size for a mini stream;
                                     // typically 4096 bytes
            uint32 sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
            uint32 csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
            uint32 sectDifStart; // [44H,04] first SECT in the DIFAT chain
            uint32 csectDif; // [48H,04] number of SECTs in the DIFAT chain
            uint32 sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
        } cfbf_header_t;
    """
    HEADER_STRUCT = "cfbf_header_t"

    EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="MSI",
        description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
        handler_type=HandlerType.ARCHIVE,
        vendor="Microsoft",
        references=[
            Reference(
                title="MSI File Format Documentation",
                url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
            ),
            Reference(
                title="Compound File Binary Format",
                url="https://en.wikipedia.org/wiki/Compound_File_Binary_Format",
            ),
        ],
        limitations=[
            "Limited to CFB based extraction, not full-on MSI extraction",
            "Extracted files have names coming from CFB internal representation, and may not correspond to the one they would have on disk after running the installer",
        ],
    )

    # Special sector numbers defined by the CFBF specification.
    _MAXREGSECT = 0xFFFFFFFA  # highest value that designates a regular sector
    _FREESECT = 0xFFFFFFFF  # unallocated sector / unused table slot

    # The specification only allows 512-byte (v3) and 4096-byte (v4) sectors.
    _VALID_SECTOR_SHIFTS = (9, 12)

    def _read_sector_entries(
        self, file: File, start_offset: int, sector_id: int, sector_size: int
    ) -> tuple:
        """Return sector ``sector_id`` decoded as little-endian uint32 entries.

        Raises:
            InvalidInputFormat: if the sector does not fit entirely in the file.
        """
        # The header occupies the first full sector of the container (it is
        # zero-padded to 4096 bytes for 4 KB sectors), so sector N starts at
        # (N + 1) * sector_size from the beginning of the container.
        sector_offset = start_offset + (sector_id + 1) * sector_size
        if sector_offset + sector_size > file.size():
            raise InvalidInputFormat("Invalid MSI file, sector offset too large")

        file.seek(sector_offset, io.SEEK_SET)
        raw_sector = file.read(sector_size)
        # Guard against short reads so struct.unpack cannot fail with a
        # struct.error that callers do not expect.
        if len(raw_sector) != sector_size:
            raise InvalidInputFormat("Invalid MSI file, truncated sector")

        return struct.unpack(f"<{sector_size // 4}I", raw_sector)

    def _collect_fat_sectors(
        self, file: File, start_offset: int, header, sector_size: int
    ) -> list:
        """Return the ordered list of FAT sector locations (the DIFAT).

        The first 109 locations live in the header; any further ones are
        stored in the chain of DIFAT sectors starting at ``sectDifStart``.

        Raises:
            InvalidInputFormat: if the DIFAT chain loops or leaves the file.
        """
        fat_sectors = list(header.sectFat)

        difat_sector = header.sectDifStart
        visited = set()
        for _ in range(header.csectDif):
            # ENDOFCHAIN / FREESECT (or any other special value) ends the chain.
            if difat_sector > self._MAXREGSECT:
                break
            if difat_sector in visited:
                raise InvalidInputFormat("Invalid MSI file, DIFAT chain loops")
            visited.add(difat_sector)

            entries = self._read_sector_entries(
                file, start_offset, difat_sector, sector_size
            )
            # Every slot but the last is a FAT sector location; the last slot
            # links to the next DIFAT sector.
            fat_sectors.extend(entries[:-1])
            difat_sector = entries[-1]

        return fat_sectors

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Compute the extent of the CFBF container starting at ``start_offset``.

        The size is the header sector plus every sector up to and including
        the highest one that the FAT does not mark as free.

        Raises:
            InvalidInputFormat: if the sector size is not one allowed by the
                specification, or if a FAT/DIFAT sector lies outside the file.
        """
        file.seek(start_offset, io.SEEK_SET)
        header = self.parse_header(file)

        if header.sectorShift not in self._VALID_SECTOR_SHIFTS:
            raise InvalidInputFormat("Invalid MSI file, unsupported sector size")

        sector_size = 1 << header.sectorShift
        entries_per_sector = sector_size // 4

        fat_sectors = self._collect_fat_sectors(
            file, start_offset, header, sector_size
        )

        max_used_sector = 0
        for fat_index, fat_sector in enumerate(fat_sectors):
            # Unused DIFAT slot: no FAT sector covers this range.
            if fat_sector == self._FREESECT:
                continue

            entries = self._read_sector_entries(
                file, start_offset, fat_sector, sector_size
            )

            # Scan backwards: the first non-free entry is the highest sector
            # this FAT sector describes as used.
            last_used = next(
                (
                    entry_id
                    for entry_id in range(len(entries) - 1, -1, -1)
                    if entries[entry_id] != self._FREESECT
                ),
                None,
            )
            if last_used is None:
                continue

            # FAT sector number k describes sectors
            # [k * entries_per_sector, (k + 1) * entries_per_sector).
            max_used_sector = max(
                max_used_sector, fat_index * entries_per_sector + last_used
            )

        # One sector for the header, then sectors 0..max_used_sector.
        total_size = sector_size + (max_used_sector + 1) * sector_size

        return ValidChunk(
            start_offset=start_offset,
            end_offset=start_offset + total_size,
        )

python/unblob/processing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@
5454
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
5555
DEFAULT_SKIP_MAGIC = (
5656
"BFLT",
57-
"Composite Document File V2 Document",
57+
# Disabled for MSI files
58+
# "Composite Document File V2 Document",
5859
"Erlang BEAM file",
5960
"GIF",
6061
"GNU message catalog",
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:dce9e456ace76b969fe0fe4d228bf096662c11d2376d99a9210f6364428a94c4
3+
size 1563648
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:da8f4120ab4ffacb19067a26f6a8b2695e00ec19bcc48ff694349c62df1b330b
3+
size 1563680
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:aa8e5036d973688f1e8622fbe9ab22e037346e0def0197bf5e7cdf37da4e223d
3+
size 3831808
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:12c87c542e1d4a39b47f176ffa5fd1691c98e5f9d502e6e46573962fb77c4510
3+
size 3831840
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
version https://git-lfs.github.com/spec/v1
2+
oid sha256:374708fff7719dd5979ec875d56cd2286f6d3cf7ec317a3b25632aab28ec37bb
3+
size 16

0 commit comments

Comments
 (0)