Skip to content

Commit 551bc49

Browse files
committed
feat(handler): add support for MSI files
Seems to work but had to DIY the parsing due to issues with olefile.
1 parent af55495 commit 551bc49

File tree

3 files changed

+143
-1
lines changed

3 files changed

+143
-1
lines changed

python/unblob/handlers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
cab,
77
cpio,
88
dmg,
9+
msi,
910
par2,
1011
partclone,
1112
rar,
@@ -89,6 +90,7 @@
8990
arc.ARCHandler,
9091
arj.ARJHandler,
9192
cab.CABHandler,
93+
msi.MsiHandler,
9294
tar.TarUstarHandler,
9395
tar.TarUnixHandler,
9496
cpio.PortableASCIIHandler,
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
"""MSI Handler
2+
3+
Extraction uses 7z, but could migrate to a fully Python-based implementation:
4+
5+
https://github.com/nightlark/pymsi
6+
https://github.com/decalage2/olefile
7+
8+
As of v0.47, olefile does not handle padded MSIs properly so we re-implement
9+
CFBF header parsing and compute the archive size ourselves.
10+
"""
11+
12+
import io
13+
import struct
14+
from typing import Optional
15+
16+
from structlog import get_logger
17+
18+
from unblob.extractors import Command
19+
20+
from ...file_utils import OffsetFile
21+
from ...models import (
22+
File,
23+
HandlerDoc,
24+
HandlerType,
25+
HexString,
26+
Reference,
27+
StructHandler,
28+
ValidChunk,
29+
)
30+
31+
logger = get_logger()
32+
33+
34+
class MsiHandler(StructHandler):
    """Handler for MSI installers stored as Compound File Binary (CFBF) files.

    The chunk size is derived from the File Allocation Table (FAT): the file
    ends right after the highest sector that the FAT does not mark as free.
    FAT sector locations are taken from the 109 header slots and, for larger
    files, from the DIFAT sector chain referenced by the header.
    """

    NAME = "msi"

    PATTERNS = [HexString("D0 CF 11 E0 A1 B1 1A E1")]

    C_DEFINITIONS = r"""
        typedef struct cfbf_header
        {
            // [offset from start (bytes), length (bytes)]
            uint8 signature[8]; // [00H,08] {0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1,
            // 0x1a, 0xe1} for current version
            uint8 clsid[16]; // [08H,16] reserved must be zero (WriteClassStg/
            // GetClassFile uses root directory class id)
            uint16 minorVersion; // [18H,02] minor version of the format: 33 is
            // written by reference implementation
            uint16 dllVersion; // [1AH,02] major version of the dll/format: 3 for
            // 512-byte sectors, 4 for 4 KB sectors
            uint16 byteOrder; // [1CH,02] 0xFFFE: indicates Intel byte-ordering
            uint16 sectorShift; // [1EH,02] size of sectors in power-of-two;
            // typically 9 indicating 512-byte sectors
            uint16 miniSectorShift; // [20H,02] size of mini-sectors in power-of-two;
            // typically 6 indicating 64-byte mini-sectors
            uint16 reserved; // [22H,02] reserved, must be zero
            uint32 reserved1; // [24H,04] reserved, must be zero
            uint32 csectDir; // [28H,04] must be zero for 512-byte sectors,
            // number of SECTs in directory chain for 4 KB
            // sectors
            uint32 csectFat; // [2CH,04] number of SECTs in the FAT chain
            uint32 sectDirStart; // [30H,04] first SECT in the directory chain
            uint32 txSignature; // [34H,04] signature used for transactions; must
            // be zero. The reference implementation
            // does not support transactions
            uint32 miniSectorCutoff; // [38H,04] maximum size for a mini stream;
            // typically 4096 bytes
            uint32 sectMiniFatStart; // [3CH,04] first SECT in the MiniFAT chain
            uint32 csectMiniFat; // [40H,04] number of SECTs in the MiniFAT chain
            uint32 sectDifStart; // [44H,04] first SECT in the DIFAT chain
            uint32 csectDif; // [48H,04] number of SECTs in the DIFAT chain
            uint32 sectFat[109]; // [4CH,436] the SECTs of first 109 FAT sectors
        } cfbf_header_t;
    """
    HEADER_STRUCT = "cfbf_header_t"

    EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

    DOC = HandlerDoc(
        name="MSI",
        description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
        handler_type=HandlerType.ARCHIVE,
        vendor="Microsoft",
        references=[
            Reference(
                title="MSI File Format Documentation",
                url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
            ),
            Reference(
                title="Compound File Binary Format",
                url="https://en.wikipedia.org/wiki/Compound_File_Binary_Format",
            ),
        ],
        limitations=[],
    )

    # Special sector numbers defined by the CFBF specification.
    _FREESECT = 0xFFFFFFFF  # unallocated sector / unused table slot
    _MAXREGSECT = 0xFFFFFFFA  # highest number usable by a regular sector

    # Value of the byteOrder header field once parsed as little-endian.
    _BYTE_ORDER_MARK = 0xFFFE

    # Sector shifts allowed by the specification: 512-byte (v3) and 4 KB (v4).
    _VALID_SECTOR_SHIFTS = (9, 12)

    @staticmethod
    def _read_sector(
        file: File, start_offset: int, sector_id: int, sector_struct: struct.Struct
    ) -> Optional[tuple]:
        """Read sector ``sector_id`` and decode it as little-endian uint32 entries.

        The header occupies the space of one full sector, so sector ``n``
        starts at ``(n + 1) * sector_size`` from the beginning of the compound
        file. Returns ``None`` when the sector is not fully inside the file.
        """
        sector_size = sector_struct.size
        try:
            file.seek(start_offset + (sector_id + 1) * sector_size)
        except (ValueError, OSError):
            # Seeking outside of the underlying file: the sector cannot exist.
            return None

        raw_sector = file.read(sector_size)
        if len(raw_sector) != sector_size:
            return None
        return sector_struct.unpack(raw_sector)

    def _collect_fat_locations(
        self, file: File, start_offset: int, header, sector_struct: struct.Struct
    ) -> Optional[list]:
        """Return the sector numbers of all FAT sectors, in FAT order.

        The first 109 locations come from the header; the remaining ones are
        stored in DIFAT sectors, whose last entry links to the next DIFAT
        sector. Unused slots are kept in the list so that the position of an
        entry is always the index of the FAT sector it describes. Returns
        ``None`` if a DIFAT sector cannot be read.
        """
        fat_locations = list(header.sectFat)

        visited = set()
        difat_sector = header.sectDifStart
        while (
            len(visited) < header.csectDif
            and difat_sector <= self._MAXREGSECT
            # A sector seen twice means the chain loops: stop instead of spinning.
            and difat_sector not in visited
        ):
            visited.add(difat_sector)
            entries = self._read_sector(file, start_offset, difat_sector, sector_struct)
            if entries is None:
                return None
            fat_locations.extend(entries[:-1])
            difat_sector = entries[-1]

        return fat_locations

    def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
        """Compute the extent of the compound file starting at ``start_offset``.

        Returns a ``ValidChunk`` covering the header and every sector up to the
        highest one in use, or ``None`` when the header is inconsistent or the
        allocation tables cannot be read.
        """
        file.seek(start_offset)
        header = self.parse_header(file)

        if header.byteOrder != self._BYTE_ORDER_MARK:
            return None
        # Rejecting unexpected shifts also keeps the sector size bounded.
        if header.sectorShift not in self._VALID_SECTOR_SHIFTS:
            return None

        sector_size = 1 << header.sectorShift
        entries_per_sector = sector_size // 4
        sector_struct = struct.Struct(f"<{entries_per_sector}I")

        fat_locations = self._collect_fat_locations(
            file, start_offset, header, sector_struct
        )
        if fat_locations is None:
            return None

        # Size of MSI is based on the maximum used sector: scan every FAT
        # sector backwards for its last entry that is not marked as free.
        max_used_sector = None
        for fat_index, fat_sector in enumerate(fat_locations):
            # Skip unused slots (and any other non-regular sector number).
            if fat_sector > self._MAXREGSECT:
                continue

            entries = self._read_sector(file, start_offset, fat_sector, sector_struct)
            if entries is None:
                return None

            last_used = next(
                (
                    position
                    for position in range(entries_per_sector - 1, -1, -1)
                    if entries[position] != self._FREESECT
                ),
                None,
            )
            if last_used is None:
                continue

            sector_id = fat_index * entries_per_sector + last_used
            if max_used_sector is None or sector_id > max_used_sector:
                max_used_sector = sector_id

        if max_used_sector is None:
            # No allocated sector at all: not a usable compound file.
            return None

        # One sector-sized slot for the header plus sectors 0..max_used_sector.
        total_size = (max_used_sector + 2) * sector_size

        return ValidChunk(
            start_offset=start_offset,
            end_offset=start_offset + total_size,
        )

python/unblob/processing.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,8 @@
5454
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
5555
DEFAULT_SKIP_MAGIC = (
5656
"BFLT",
57-
"Composite Document File V2 Document",
57+
# Disabled for MSI files
58+
#"Composite Document File V2 Document",
5859
"Erlang BEAM file",
5960
"GIF",
6061
"GNU message catalog",

0 commit comments

Comments
 (0)